zmiany w szablonach i poprawki w OCR

2025-07-25 10:42:07 +02:00
parent 34205f0e65
commit 0d5b170cac
6 changed files with 173 additions and 148 deletions
--- a/app.py
+++ b/app.py
@@ -115,7 +115,7 @@ class ShoppingList(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(150), nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
-    
+
    owner_id = db.Column(db.Integer, db.ForeignKey("user.id"))
    owner = db.relationship("User", backref="lists", foreign_keys=[owner_id])

@@ -135,7 +135,9 @@ class Item(db.Model):
    # added_at = db.Column(db.DateTime, default=datetime.utcnow)
    added_at = db.Column(db.DateTime, default=utcnow)
    added_by = db.Column(db.Integer, db.ForeignKey("user.id"), nullable=True)
-    added_by_user = db.relationship("User", backref="added_items", lazy=True, foreign_keys=[added_by])
+    added_by_user = db.relationship(
+        "User", backref="added_items", lazy=True, foreign_keys=[added_by]
+    )

    purchased = db.Column(db.Boolean, default=False)
    purchased_at = db.Column(db.DateTime, nullable=True)
@@ -393,22 +395,25 @@ def preprocess_image_for_tesseract(image):


 def extract_total_tesseract(image):
+    import pytesseract
+    from pytesseract import Output
+    import re
+
    text = pytesseract.image_to_string(image, lang="pol", config="--psm 4")
    lines = text.splitlines()
    candidates = []
-    keyword_lines_debug = []

-    fuzzy_regex = re.compile(r"[\dOo][.,:;g9zZ][\d]{2}")
-    keyword_pattern = re.compile(
+    blacklist_keywords = re.compile(r"\b(ptu|vat|podatek|stawka)\b", re.IGNORECASE)
+
+    priority_keywords = re.compile(
        r"""
        \b(
-            [5s]u[mn][aąo0]? |
-            razem |
-            zap[łl][aąo0]ty |
-            do\s+zap[łl][aąo0]ty |
+            razem\s*do\s*zap[łl][aąo0]ty |
+            do\s*zap[łl][aąo0]ty |
+            suma |
            kwota |
-            płatno[śćs] |
            warto[śćs] |
+            płatno[śćs] |
            total |
            amount
        )\b
@@ -416,84 +421,71 @@ def extract_total_tesseract(image):
        re.IGNORECASE | re.VERBOSE,
    )

-    for idx, line in enumerate(lines):
-        if keyword_pattern.search(line[:30]):
-            keyword_lines_debug.append((idx, line))
-
    for line in lines:
        if not line.strip():
            continue

-        matches = re.findall(r"\d{1,4}\s?[.,]\d{2}", line)
+        if blacklist_keywords.search(line):
+            continue
+
+        is_priority = priority_keywords.search(line)
+
+        matches = re.findall(r"\d{1,4}[.,]\d{2}", line)
        for match in matches:
            try:
-                val = float(match.replace(" ", "").replace(",", "."))
+                val = float(match.replace(",", "."))
                if 0.1 <= val <= 100000:
-                    candidates.append((val, line))
+                    candidates.append((val, line, is_priority is not None))
            except:
                continue

-        spaced = re.findall(r"\d{1,4}\s\d{2}", line)
-        for match in spaced:
-            try:
-                val = float(match.replace(" ", "."))
-                if 0.1 <= val <= 100000:
-                    candidates.append((val, line))
-            except:
-                continue
+        # Tylko w liniach priorytetowych: sprawdzamy spaced fallback
+        if is_priority:
+            spaced = re.findall(r"\d{1,4}\s\d{2}", line)
+            for match in spaced:
+                try:
+                    val = float(match.replace(" ", "."))
+                    if 0.1 <= val <= 100000:
+                        candidates.append((val, line, True))
+                except:
+                    continue

-        fuzzy_matches = fuzzy_regex.findall(line)
-        for match in fuzzy_matches:
-            cleaned = (
-                match.replace("O", "0")
-                .replace("o", "0")
-                .replace(":", ".")
-                .replace(";", ".")
-                .replace(",", ".")
-                .replace("g", "9")
-                .replace("z", "9")
-                .replace("Z", "9")
-            )
-            try:
-                val = float(cleaned)
-                if 0.1 <= val <= 100000:
-                    candidates.append((val, line))
-            except:
-                continue
-
-    preferred = [
-        (val, line) for val, line in candidates if keyword_pattern.search(line.lower())
-    ]
+    # Preferujemy linie priorytetowe
+    preferred = [(val, line) for val, line, is_pref in candidates if is_pref]

    if preferred:
-        max_val = max(preferred, key=lambda x: x[0])[0]
-        return round(max_val, 2), lines
+        best_val = max(preferred, key=lambda x: x[0])[0]
+        if best_val < 99999:
+            return round(best_val, 2), lines

    if candidates:
-        max_val = max([val for val, _ in candidates])
-        return round(max_val, 2), lines
+        best_val = max(candidates, key=lambda x: x[0])[0]
+        if best_val < 99999:
+            return round(best_val, 2), lines

+    # Fallback: największy font + bold
    data = pytesseract.image_to_data(
        image, lang="pol", config="--psm 4", output_type=Output.DICT
    )
-    font_candidates = []

+    font_candidates = []
    for i in range(len(data["text"])):
        word = data["text"][i].strip()
-        if not word:
+        if not word or not re.match(r"^\d{1,5}[.,\s]\d{2}$", word):
            continue

-        if re.match(r"^\d{1,5}[.,\s]\d{2}$", word):
-            try:
-                val = float(word.replace(",", ".").replace(" ", "."))
-                height = data["height"][i]
-                if 0.1 <= val <= 10000:
-                    font_candidates.append((val, height, word))
-            except:
-                continue
+        try:
+            val = float(word.replace(",", ".").replace(" ", "."))
+            height = data["height"][i]
+            conf = int(data.get("conf", ["0"] * len(data["text"]))[i])
+            if 0.1 <= val <= 100000:
+                font_candidates.append((val, height, conf))
+        except:
+            continue

    if font_candidates:
-        best = max(font_candidates, key=lambda x: x[1])
+        # Preferuj najwyższy font z sensownym confidence
+        best = max(font_candidates, key=lambda x: (x[1], x[2]))
        return round(best[0], 2), lines

    return 0.0, lines
@@ -964,15 +956,32 @@ def view_list(list_id):
@app.route("/user_expenses")
@login_required
 def user_expenses():
-    # Lista wydatków użytkownika
-    expenses = (
+    start_date_str = request.args.get("start_date")
+    end_date_str = request.args.get("end_date")
+    start = None
+    end = None
+
+    # Przygotowanie podstawowego zapytania o wydatki użytkownika
+    expenses_query = (
        Expense.query.join(ShoppingList, Expense.list_id == ShoppingList.id)
        .options(joinedload(Expense.list))
        .filter(ShoppingList.owner_id == current_user.id)
-        .order_by(Expense.added_at.desc())
-        .all()
    )

+    # Filtrowanie po zakresie dat, jeśli podano
+    if start_date_str and end_date_str:
+        try:
+            start = datetime.strptime(start_date_str, "%Y-%m-%d")
+            end = datetime.strptime(end_date_str, "%Y-%m-%d") + timedelta(days=1)
+            expenses_query = expenses_query.filter(
+                Expense.added_at >= start, Expense.added_at < end
+            )
+        except ValueError:
+            flash("Błędny zakres dat", "danger")
+
+    expenses = expenses_query.order_by(Expense.added_at.desc()).all()
+
+    # Tabela wydatków
    expense_table = [
        {
            "title": e.list.title if e.list else "Nieznana",
@@ -982,34 +991,32 @@ def user_expenses():
        for e in expenses
    ]

+    # Tylko listy z tych wydatków
+    list_ids = {e.list_id for e in expenses}
    lists = (
-        ShoppingList.query
-        .filter(
-            or_(
-                ShoppingList.owner_id == current_user.id,
-                ShoppingList.is_public == True
-            )
-        )
+        ShoppingList.query.filter(ShoppingList.id.in_(list_ids))
        .order_by(ShoppingList.created_at.desc())
        .all()
    )

+    # Lista zsumowanych wydatków per lista (z uwzględnieniem filtra dat)
    lists_data = [
        {
            "id": l.id,
            "title": l.title,
            "created_at": l.created_at,
-            "total_expense": sum(e.amount for e in l.expenses),
-            "owner_username": l.owner.username if l.owner else "?"
+            "total_expense": sum(
+                e.amount
+                for e in l.expenses
+                if (not start or not end) or (e.added_at >= start and e.added_at < end)
+            ),
+            "owner_username": l.owner.username if l.owner else "?",
        }
        for l in lists
    ]

-
    return render_template(
-        "user_expenses.html",
-        expense_table=expense_table,
-        lists_data=lists_data
+        "user_expenses.html", expense_table=expense_table, lists_data=lists_data
    )


@@ -1028,7 +1035,7 @@ def user_expenses_data():
        try:
            start = datetime.strptime(start_date, "%Y-%m-%d")
            end = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
-            query = query.filter(Expense.timestamp >= start, Expense.timestamp < end)
+            query = query.filter(Expense.added_at >= start, Expense.added_at < end)
        except ValueError:
            return jsonify({"error": "Błędne daty"}), 400

@@ -2332,7 +2339,6 @@ def handle_add_item(data):
    )


-
@socketio.on("check_item")
 def handle_check_item(data):
    # item = Item.query.get(data["item_id"])
@@ -2420,7 +2426,6 @@ def handle_request_full_list(data):
    emit("full_list", {"items": items_data}, to=request.sid)


-
@socketio.on("update_note")
 def handle_update_note(data):
    item_id = data["item_id"]
@@ -2490,16 +2495,6 @@ def handle_unmark_not_purchased(data):
        emit("item_unmarked_not_purchased", {"item_id": item.id}, to=str(item.list_id))


-""" @socketio.on('receipt_uploaded')
-def handle_receipt_uploaded(data):
-    list_id = data['list_id']
-    url = data['url']
-
-    emit('receipt_added', {
-        'url': url
-    }, to=str(list_id), include_self=False) """
-
-
@app.cli.command("create_db")
 def create_db():
    db.create_all()