commit bb941cff4354d9305b744314278d7358532d3398
Author: root
Date:   Sat May 24 23:00:30 2025 +0200

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2db0672
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+.env
+config.py
+venv
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f43895d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# compare_filesize_drupal6
+
+Compares the file sizes recorded in a Drupal 6 `files` table with the actual files on disk; can also report duplicates, orphaned files, and oversized images.
diff --git a/config.example.py b/config.example.py
new file mode 100644
index 0000000..ac57154
--- /dev/null
+++ b/config.example.py
@@ -0,0 +1,10 @@
+# config.py
+
+DB_CONFIG = {
+    'host': 'localhost',
+    'database': 'drupal6_db',
+    'user': 'drupal_user',
+    'password': 'secure_password',
+}
+
+FILES_BASE_PATH = '/path/to/drupal/sites/default/files/'
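config.py itself is listed in .gitignore, so each deployment is expected to copy config.example.py to config.py and fill in real credentials. Below is a minimal sketch of a fail-fast startup check; the file name check_config.py and the validate_config helper are illustrative and not part of this commit:

    # check_config.py, a hypothetical helper, not part of this commit
    import os

    from config import DB_CONFIG, FILES_BASE_PATH

    def validate_config():
        # Fail fast here rather than deep inside a query loop.
        required = ('host', 'database', 'user', 'password')
        missing = [key for key in required if not DB_CONFIG.get(key)]
        if missing:
            raise SystemExit(f"config.py is missing DB_CONFIG keys: {missing}")
        if not os.path.isdir(FILES_BASE_PATH):
            raise SystemExit(f"FILES_BASE_PATH does not exist: {FILES_BASE_PATH}")

    if __name__ == '__main__':
        validate_config()
        print("config.py looks usable")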
+ """ + # Pobierz listę tabel w bazie + cursor.execute("SHOW TABLES") + tables = [row[f'Tables_in_{DB_CONFIG["database"]}'] for row in cursor.fetchall()] + + filepaths = set() + + for table in tables: + # Sprawdź kolumny danej tabeli + cursor.execute(f"SHOW COLUMNS FROM `{table}` LIKE 'filepath'") + if cursor.rowcount == 1: + # Tabela ma kolumnę 'filepath' — pobierz jej wartości + try: + cursor.execute(f"SELECT filepath FROM `{table}`") + rows = cursor.fetchall() + for row in rows: + path = row['filepath'].replace('\\', '/') + filepaths.add(path) + except Exception as e: + print(f"[WARN] Nie udało się pobrać danych z tabeli '{table}': {e}") + # Możesz tu też zdecydować, czy chcesz przerwać, czy tylko ostrzec + + return filepaths + + +def file_exists_on_disk(base_path, rel_path): + # Normalizuj ścieżkę z bazy + rel_path_norm = rel_path.lstrip("/\\") # usuń początkowe ukośniki + + full_path = os.path.normpath(os.path.join(base_path, rel_path_norm)) + + # Debug print, żeby zweryfikować jak łączy się ścieżka + # print(f"Sprawdzam plik: {full_path}") + + return os.path.isfile(full_path), full_path + +def compare_files( + debug=False, + to_optimize=False, + update_db=False, + dry_run=False, + extensions='jpg,jpeg,png,gif', + exclude_folders=None, + find_duplicates=False, + show_deleted=False, + top=20, +): + allowed_exts = tuple(f".{ext.strip().lower()}" for ext in extensions.split(',')) + exclude_folders = exclude_folders or [] + exclude_folders = [folder.rstrip('/\\') for folder in exclude_folders] + + connection = get_connection() + cursor = connection.cursor() + + # Pobierz pliki z bazy + cursor.execute("SELECT fid, filepath, filesize FROM files") + files = cursor.fetchall() + + # Zbiór ścieżek plików używanych w systemie (dla duplikatów i wykrywania osieroconych) + used_filepaths = fetch_used_filepaths(cursor) + + stats = { + 'total': 0, + 'missing': 0, + 'mismatched': 0, + 'saved_bytes_potential': 0, + 'largest': [], + 'highest_res': [], + 'to_optimize': [], + 'duplicates': [], + 'deleted': [], + } + + images_info = [] + hash_map = {} + + # Przetwarzanie plików z bazy + for file in files: + rel_path = file['filepath'].replace('\\', '/') + # Sprawdź wykluczenia folderów + if any(rel_path.startswith(excl + '/') or rel_path == excl for excl in exclude_folders): + if debug: + print(f"[POMINIĘTO] {rel_path} (folder wykluczony)") + continue + + if not rel_path.lower().endswith(allowed_exts): + continue + + fid = file['fid'] + db_size = file['filesize'] + full_path = os.path.join(FILES_BASE_PATH, rel_path) + + stats['total'] += 1 + + if not os.path.isfile(full_path): + stats['missing'] += 1 + if debug: + print(f"[BRAK] {rel_path}") + print(f" → {full_path}") + continue + + actual_size = os.path.getsize(full_path) + width, height = get_image_resolution(full_path) + file_hash = hash_file(full_path) + + images_info.append({ + 'fid': fid, + 'path': full_path, + 'rel_path': rel_path, + 'db_size': db_size, + 'actual_size': actual_size, + 'width': width, + 'height': height, + 'hash': file_hash, + }) + + # Rozmiar różniący się + if actual_size != db_size: + stats['mismatched'] += 1 + diff = db_size - actual_size + if diff > 0: + stats['saved_bytes_potential'] += diff + + if debug: + print(f"[ROZMIAR] {rel_path}") + print(f" → DB: {db_size} B, Dysk: {actual_size} B ({(db_size - actual_size)/1024:.1f} KB różnicy)") + + if update_db and not dry_run: + update_sql = "UPDATE files SET filesize = %s WHERE fid = %s" + cursor.execute(update_sql, (actual_size, fid)) + connection.commit() + elif update_db and 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..48f10fb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pymysql
+Pillow
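Duplicate detection keys every file by the MD5 of its contents: hash_file() reads in 8 KB chunks, so large images never have to fit in memory, and byte-identical copies collide on the same digest wherever they live in the tree. The same grouping idea as a standalone sketch (the directory path is illustrative):

    import hashlib
    import os
    from collections import defaultdict

    def md5_of(path, chunk_size=8192):
        # Stream the file in chunks; identical bytes yield identical digests.
        hasher = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def group_duplicates(root):
        groups = defaultdict(list)
        for dirpath, _, filenames in os.walk(root):
            for name in filenames:
                full = os.path.join(dirpath, name)
                groups[md5_of(full)].append(full)
        # Keep only digests seen more than once.
        return {h: paths for h, paths in groups.items() if len(paths) > 1}

    if __name__ == '__main__':
        for digest, paths in group_duplicates('/path/to/files').items():
            print(digest, paths)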
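Once config.py is in place, the analysis can also be driven from Python rather than the command line. Every keyword below is a real parameter of compare_files(); only the excluded folder name is illustrative:

    from main import compare_files, print_summary

    # Report-only pass: update_db defaults to False, so neither the database
    # nor the files on disk are modified.
    stats = compare_files(
        to_optimize=True,
        find_duplicates=True,
        extensions='jpg,jpeg,png',
        exclude_folders=['imagecache'],  # illustrative folder name
        top=10,
    )
    print_summary(stats, to_optimize=True, find_duplicates=True, top=10)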