From a44a61c7184f0c5e2bf96ba07a994e5b4f2cd967 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Gruszczy=C5=84ski?=
 <mateusz.gruszczynski@firma.interia.pl>
Date: Tue, 22 Jul 2025 11:23:00 +0200
Subject: [PATCH] ocr usprawnienia

---
 app.py | 106 ++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 35 deletions(-)

diff --git a/app.py b/app.py
index ce92783..c2dd629 100644
--- a/app.py
+++ b/app.py
@@ -9,7 +9,7 @@ import psutil
 import secrets
 import hashlib
 import re
-
+import numpy as np
 
 from pillow_heif import register_heif_opener
 
@@ -44,7 +44,7 @@ from flask_compress import Compress
 from flask_socketio import SocketIO, emit, join_room
 from werkzeug.security import generate_password_hash, check_password_hash
 from config import Config
-from PIL import Image, ExifTags
+from PIL import Image, ExifTags, ImageFilter, ImageOps
 from werkzeug.utils import secure_filename
 from werkzeug.middleware.proxy_fix import ProxyFix
 from sqlalchemy import func, extract
@@ -54,6 +54,7 @@ from functools import wraps
 # OCR
 from collections import Counter
 import pytesseract
+from pytesseract import Output
 
 
 app = Flask(__name__)
@@ -295,8 +296,8 @@ def save_resized_image(file, path):
     image.info.clear()
 
     new_path = path.rsplit(".", 1)[0] + ".webp"
-    image.save(new_path, format="WEBP", quality=85, method=6)
-
+    #image.save(new_path, format="WEBP", quality=85, method=6)
+    image.save(new_path, format="WEBP", quality=100, method=0)
 
 def redirect_with_flash(
     message: str, category: str = "info", endpoint: str = "main_page"
@@ -343,43 +344,57 @@ def _receipt_error(message):
 
 ############# OCR ###########################
 
-
-def preprocess_image_for_tesseract(pil_image):
-    import cv2
-    import numpy as np
-    from PIL import Image
-
-    # Konwersja PIL.Image → NumPy grayscale
-    image = np.array(pil_image.convert("L"))
-
-    # Zwiększenie skali dla lepszej czytelności OCR
-    image = cv2.resize(image, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)
-
-    # Adaptacyjne progowanie (lepsze niż THRESH_BINARY przy nierównym tle)
-    image = cv2.adaptiveThreshold(
-        image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, blockSize=15, C=10
-    )
-
-    # Konwersja z powrotem na PIL.Image (dla pytesseract)
-    return Image.fromarray(image)
-
+def preprocess_image_for_tesseract(image):
+    image = ImageOps.autocontrast(image)  # normalize contrast before thresholding
+    image = image.point(lambda x: 0 if x < 160 else 255)  # stronger binarization: values below 160 -> 0, the rest -> 255
+    image = image.resize((image.width * 2, image.height * 2), Image.BICUBIC)  # larger upscaling: 2x width and height, bicubic
+    return image
 
 def extract_total_tesseract(image):
-
-    text = pytesseract.image_to_string(image, lang="pol", config="--psm 6")
+    text = pytesseract.image_to_string(image, lang="pol", config="--psm 4")
     lines = text.splitlines()
     candidates = []
+    keyword_lines_debug = []
 
     fuzzy_regex = re.compile(r"[\dOo][.,:;g9zZ][\d]{2}")
+    keyword_pattern = re.compile(
+        r"""
+        \b(
+            [5s]u[mn][aąo0]? |
+            razem |
+            zap[łl][aąo0]ty |
+            do\s+zap[łl][aąo0]ty |
+            kwota |
+            płatno[śćs] |
+            warto[śćs] |
+            total |
+            amount
+        )\b
+        """,
+        re.IGNORECASE | re.VERBOSE
+    )
+
+    for idx, line in enumerate(lines):
+        if keyword_pattern.search(line[:30]):
+            keyword_lines_debug.append((idx, line))
 
     for line in lines:
         if not line.strip():
             continue
 
-        matches = re.findall(r"\d{1,4}[.,]\d{2}", line)
+        matches = re.findall(r"\d{1,4}\s?[.,]\d{2}", line)
         for match in matches:
             try:
-                val = float(match.replace(",", "."))
+                val = float(match.replace(" ", "").replace(",", "."))
+                if 0.1 <= val <= 100000:
+                    candidates.append((val, line))
+            except:
+                continue
+
+        spaced = re.findall(r"\d{1,4}\s\d{2}", line)
+        for match in spaced:
+            try:
+                val = float(match.replace(" ", "."))
                 if 0.1 <= val <= 100000:
                     candidates.append((val, line))
             except:
@@ -399,24 +414,45 @@ def extract_total_tesseract(image):
             )
             try:
                 val = float(cleaned)
-                if 0.1 <= val <= 100:
+                if 0.1 <= val <= 100000:
                     candidates.append((val, line))
             except:
                 continue
 
     preferred = [
-        val
+        (val, line)
         for val, line in candidates
-        if re.search(r"sum[aąo]?|razem|zapłaty", line.lower())
+        if keyword_pattern.search(line.lower())
     ]
 
     if preferred:
-        max_val = round(max(preferred), 2)
-        return max_val, lines
+        max_val = max(preferred, key=lambda x: x[0])[0]
+        return round(max_val, 2), lines
 
     if candidates:
-        max_val = round(max([val for val, _ in candidates]), 2)
-        return max_val, lines
+        max_val = max([val for val, _ in candidates])
+        return round(max_val, 2), lines
+
+    data = pytesseract.image_to_data(image, lang="pol", config="--psm 4", output_type=Output.DICT)
+    font_candidates = []
+
+    for i in range(len(data["text"])):
+        word = data["text"][i].strip()
+        if not word:
+            continue
+
+        if re.match(r"^\d{1,5}[.,\s]\d{2}$", word):
+            try:
+                val = float(word.replace(",", ".").replace(" ", "."))
+                height = data["height"][i]
+                if 0.1 <= val <= 10000:
+                    font_candidates.append((val, height, word))
+            except:
+                continue
+
+    if font_candidates:
+        best = max(font_candidates, key=lambda x: x[1])
+        return round(best[0], 2), lines
 
     return 0.0, lines