zmiany w szablonach i poprawki w OCR

This commit is contained in:
Mateusz Gruszczyński
2025-07-25 10:42:07 +02:00
parent 34205f0e65
commit 0d5b170cac
6 changed files with 173 additions and 148 deletions

179
app.py
View File

@@ -115,7 +115,7 @@ class ShoppingList(db.Model):
id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.String(150), nullable=False)
created_at = db.Column(db.DateTime, default=datetime.utcnow)
owner_id = db.Column(db.Integer, db.ForeignKey("user.id"))
owner = db.relationship("User", backref="lists", foreign_keys=[owner_id])
@@ -135,7 +135,9 @@ class Item(db.Model):
# added_at = db.Column(db.DateTime, default=datetime.utcnow)
added_at = db.Column(db.DateTime, default=utcnow)
added_by = db.Column(db.Integer, db.ForeignKey("user.id"), nullable=True)
added_by_user = db.relationship("User", backref="added_items", lazy=True, foreign_keys=[added_by])
added_by_user = db.relationship(
"User", backref="added_items", lazy=True, foreign_keys=[added_by]
)
purchased = db.Column(db.Boolean, default=False)
purchased_at = db.Column(db.DateTime, nullable=True)
@@ -393,22 +395,25 @@ def preprocess_image_for_tesseract(image):
def extract_total_tesseract(image):
    """Estimate the total amount printed on a receipt image using Tesseract.

    The image is OCR-ed with the Polish language model and every text line
    is scanned for amounts written as ``<1-4 digits>[.,]<2 digits>``.  Lines
    mentioning tax-related words (PTU, VAT, ...) are ignored.  Lines that
    contain a "total"-like keyword are treated as priority lines; only on
    those lines are amounts with a whitespace separator (``12 50``) accepted
    as well.

    Selection order:
      1. the largest amount found on a priority line,
      2. the largest amount found on any line,
      3. fallback: the amount-looking word with the tallest bounding box
         (ties broken by OCR confidence) from ``image_to_data``.

    Args:
        image: Image object accepted by ``pytesseract`` (passed through
            unchanged to ``image_to_string`` / ``image_to_data``).

    Returns:
        tuple: ``(total, lines)`` where ``total`` is the detected amount
        rounded to 2 decimals (``0.0`` when nothing was found) and ``lines``
        is the list of text lines returned by the OCR pass.
    """
    import re

    import pytesseract
    from pytesseract import Output

    min_amount, max_amount = 0.1, 100000
    # Values at or above this are treated as OCR garbage and never returned
    # from the text-based passes.
    sanity_cutoff = 99999

    def _to_amount(raw):
        """Parse an OCR amount string; return None if invalid or out of range."""
        try:
            value = float(raw.replace(",", ".").replace(" ", "."))
        except ValueError:
            return None
        return value if min_amount <= value <= max_amount else None

    text = pytesseract.image_to_string(image, lang="pol", config="--psm 4")
    lines = text.splitlines()

    # Patterns are compiled once, outside the per-line loop.
    blacklist_keywords = re.compile(r"\b(ptu|vat|podatek|stawka)\b", re.IGNORECASE)
    priority_keywords = re.compile(
        r"""
        \b(
            razem\s*do\s*zap[łl][aąo0]ty |
            do\s*zap[łl][aąo0]ty |
            suma |
            kwota |
            warto[śćs] |
            płatno[śćs] |
            total |
            amount
        )\b
        """,
        re.IGNORECASE | re.VERBOSE,
    )
    decimal_amount = re.compile(r"\d{1,4}[.,]\d{2}")
    spaced_amount = re.compile(r"\d{1,4}\s\d{2}")
    amount_word = re.compile(r"^\d{1,5}[.,\s]\d{2}$")

    # Each candidate is (value, found_on_priority_line).
    candidates = []
    for line in lines:
        if not line.strip() or blacklist_keywords.search(line):
            continue

        is_priority = priority_keywords.search(line) is not None

        raw_matches = decimal_amount.findall(line)
        if is_priority:
            # Space-separated amounts are too noisy to trust on ordinary
            # lines, so they are only considered on priority lines.
            raw_matches += spaced_amount.findall(line)

        for raw in raw_matches:
            value = _to_amount(raw)
            if value is not None:
                candidates.append((value, is_priority))

    # Priority lines win; otherwise fall back to every candidate.
    preferred = [value for value, on_priority in candidates if on_priority]
    everything = [value for value, _ in candidates]
    for pool in (preferred, everything):
        if pool:
            best_val = max(pool)
            if best_val < sanity_cutoff:
                return round(best_val, 2), lines

    # Fallback: pick the amount rendered with the tallest glyphs.
    data = pytesseract.image_to_data(
        image, lang="pol", config="--psm 4", output_type=Output.DICT
    )
    words = data["text"]
    heights = data["height"]
    confidences = data.get("conf") or [0] * len(words)

    font_candidates = []
    for raw_word, height, raw_conf in zip(words, heights, confidences):
        word = raw_word.strip()
        if not word or not amount_word.match(word):
            continue

        value = _to_amount(word)
        if value is None:
            continue

        # Confidence may arrive as an int, a float or a numeric string
        # (e.g. "96.5"); an unparsable value must not discard the amount.
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError):
            conf = 0

        font_candidates.append((value, height, conf))

    if font_candidates:
        # Prefer the tallest text; break ties with the higher confidence.
        best = max(font_candidates, key=lambda c: (c[1], c[2]))
        return round(best[0], 2), lines

    return 0.0, lines
@@ -964,15 +956,32 @@ def view_list(list_id):
@app.route("/user_expenses")
@login_required
def user_expenses():
# Lista wydatków użytkownika
expenses = (
start_date_str = request.args.get("start_date")
end_date_str = request.args.get("end_date")
start = None
end = None
# Przygotowanie podstawowego zapytania o wydatki użytkownika
expenses_query = (
Expense.query.join(ShoppingList, Expense.list_id == ShoppingList.id)
.options(joinedload(Expense.list))
.filter(ShoppingList.owner_id == current_user.id)
.order_by(Expense.added_at.desc())
.all()
)
# Filtrowanie po zakresie dat, jeśli podano
if start_date_str and end_date_str:
try:
start = datetime.strptime(start_date_str, "%Y-%m-%d")
end = datetime.strptime(end_date_str, "%Y-%m-%d") + timedelta(days=1)
expenses_query = expenses_query.filter(
Expense.added_at >= start, Expense.added_at < end
)
except ValueError:
flash("Błędny zakres dat", "danger")
expenses = expenses_query.order_by(Expense.added_at.desc()).all()
# Tabela wydatków
expense_table = [
{
"title": e.list.title if e.list else "Nieznana",
@@ -982,34 +991,32 @@ def user_expenses():
for e in expenses
]
# Tylko listy z tych wydatków
list_ids = {e.list_id for e in expenses}
lists = (
ShoppingList.query
.filter(
or_(
ShoppingList.owner_id == current_user.id,
ShoppingList.is_public == True
)
)
ShoppingList.query.filter(ShoppingList.id.in_(list_ids))
.order_by(ShoppingList.created_at.desc())
.all()
)
# Lista zsumowanych wydatków per lista (z uwzględnieniem filtra dat)
lists_data = [
{
"id": l.id,
"title": l.title,
"created_at": l.created_at,
"total_expense": sum(e.amount for e in l.expenses),
"owner_username": l.owner.username if l.owner else "?"
"total_expense": sum(
e.amount
for e in l.expenses
if (not start or not end) or (e.added_at >= start and e.added_at < end)
),
"owner_username": l.owner.username if l.owner else "?",
}
for l in lists
]
return render_template(
"user_expenses.html",
expense_table=expense_table,
lists_data=lists_data
"user_expenses.html", expense_table=expense_table, lists_data=lists_data
)
@@ -1028,7 +1035,7 @@ def user_expenses_data():
try:
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
query = query.filter(Expense.timestamp >= start, Expense.timestamp < end)
query = query.filter(Expense.added_at >= start, Expense.added_at < end)
except ValueError:
return jsonify({"error": "Błędne daty"}), 400
@@ -2332,7 +2339,6 @@ def handle_add_item(data):
)
@socketio.on("check_item")
def handle_check_item(data):
# item = Item.query.get(data["item_id"])
@@ -2420,7 +2426,6 @@ def handle_request_full_list(data):
emit("full_list", {"items": items_data}, to=request.sid)
@socketio.on("update_note")
def handle_update_note(data):
item_id = data["item_id"]
@@ -2490,16 +2495,6 @@ def handle_unmark_not_purchased(data):
emit("item_unmarked_not_purchased", {"item_id": item.id}, to=str(item.list_id))
""" @socketio.on('receipt_uploaded')
def handle_receipt_uploaded(data):
list_id = data['list_id']
url = data['url']
emit('receipt_added', {
'url': url
}, to=str(list_id), include_self=False) """
@app.cli.command("create_db")
def create_db():
db.create_all()