"""Phash distance probe: gate-keeper test run before enabling a new browse scraper.

Usage:
    python scripts/probe_browse_scraper.py <module>:<ClassName>
    e.g. scripts/probe_browse_scraper.py freshporno:FreshpornoScraper

Procedure:
    1. Scrape one page (~20-60 scenes) without ingesting anything into the DB.
    2. For every scene, take the "phash" fingerprint the scraper attached to it
       (computed from the downloaded thumbnail).
    3. Compute the minimum Hamming distance against the canonical phashes
       stored in the DB (scenes referenced by TPDB / StashDB sources).
    4. Report the percentage of scenes with Hamming <= 5 (the acceptable
       threshold for auto-merge).

Decision rule:
    - >= 30% of scenes with Hamming <= 5 -> enable the scraper (it hot-links
      studio thumbnails, works like freshporno).
    - < 30% -> disable it (own screenshots, an orphan factory like shyfap).

Exit codes returned by main(): 0 = ENABLE, 2 = DISABLE, 1 = no verdict could
be produced (bad scraper reference, no scraped or no canonical phashes).
"""
from __future__ import annotations

import argparse
import importlib
import sys

from sqlalchemy import select

from app.db import session_scope
from app.models.scene import SceneExternalRef, SceneFingerprint
from app.models.source import Source, SourceKind

# Distance recorded for a scene that could not be compared against anything
# (e.g. its phash is not valid hex). Far above every reporting threshold.
_NO_MATCH = 999


def hamming_hex(a: str, b: str) -> int:
    """Return the number of differing bits between two hex-encoded hashes.

    Args:
        a: First hash as a hexadecimal string.
        b: Second hash as a hexadecimal string.

    Returns:
        The count of set bits in the XOR of both values.

    Raises:
        ValueError: If either argument is not valid hexadecimal.
    """
    return bin(int(a, 16) ^ int(b, 16)).count("1")


def _parse_hex(value: str) -> int | None:
    """Parse a hex string into an int, or return None when it is malformed."""
    try:
        return int(value, 16)
    except ValueError:
        return None


def _min_distance(phash: str, canon_ints: set[int]) -> int:
    """Return the smallest Hamming distance from *phash* to any canonical hash.

    Returns ``_NO_MATCH`` when *phash* is not valid hex or *canon_ints* is empty.
    """
    probe = _parse_hex(phash)
    if probe is None:
        return _NO_MATCH
    if probe in canon_ints:
        # Exact match: no need to scan the whole canonical set.
        return 0
    best = _NO_MATCH
    for canon in canon_ints:
        dist = bin(probe ^ canon).count("1")
        if dist < best:
            best = dist
    return best


def main() -> int:
    """Run the probe for the scraper named on the command line.

    Returns:
        Process exit code: 0 for an ENABLE verdict, 2 for DISABLE, 1 when the
        probe cannot produce a verdict.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("scraper_ref", help="module:ClassName (np. freshporno:FreshpornoScraper)")
    ap.add_argument("--pages", type=int, default=1, help="max pages do scrap (default 1)")
    args = ap.parse_args()

    # Validate the reference up front instead of failing with a traceback.
    # ap.error() is avoided on purpose: it exits with status 2, which this
    # script reserves for the DISABLE verdict.
    module_name, sep, class_name = args.scraper_ref.partition(":")
    if not (sep and module_name and class_name):
        print(f"ERROR: scraper_ref must be module:ClassName, got {args.scraper_ref!r}")
        return 1

    mod = importlib.import_module(f"app.connectors.direct_scrapers.{module_name}")
    cls = getattr(mod, class_name)
    s = cls()

    print(f"Testing scraper {s.sitetag} ({class_name}) on max_pages={args.pages}")

    # Scrape: keep (title, phash) per scene; phash is None when the scene
    # carries no fingerprint of kind "phash".
    scraped: list[tuple[str, str | None]] = []
    for raw in s.latest_scenes(max_pages=args.pages):
        ph = next((f.value for f in raw.fingerprints if f.kind == "phash"), None)
        scraped.append((raw.title, ph))

    n_scraped = len(scraped)
    n_with_phash = sum(1 for _, ph in scraped if ph)
    print(f"Scraped {n_scraped} scenes, {n_with_phash} with phash")
    if n_with_phash == 0:
        print("ERROR: no phashes — scraper / thumbnail issue.")
        return 1

    # Canonical phashes: fingerprints of scenes that have an external ref to a
    # TPDB or StashDB source.
    with session_scope() as session:
        canon = session.execute(
            select(SceneFingerprint.value)
            .join(SceneExternalRef, SceneExternalRef.scene_id == SceneFingerprint.scene_id)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(
                SceneFingerprint.kind == "phash",
                Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]),
            )
        ).scalars().all()
    # Only 16-hex-digit values are compared.
    canon_phashes = [c for c in canon if len(c) == 16]
    print(f"Comparing against {len(canon_phashes)} canonical phashes...")

    # Parse every canonical hash exactly once; malformed values are skipped
    # and duplicates collapse, neither of which affects a minimum distance.
    canon_ints = {
        parsed
        for parsed in (_parse_hex(c) for c in canon_phashes)
        if parsed is not None
    }
    if not canon_ints:
        # Without reference hashes every scene would sit at the sentinel
        # distance and the probe would print a bogus DISABLE verdict.
        print("ERROR: no canonical phashes in DB — nothing to compare against.")
        return 1

    # Distances: one entry per scene that has a phash.
    distances: list[tuple[str, int]] = [
        (title, _min_distance(ph, canon_ints)) for title, ph in scraped if ph
    ]

    # Stats. total > 0 is guaranteed by the n_with_phash check above.
    total = len(distances)
    in_0 = sum(1 for _, d in distances if d == 0)
    in_5 = sum(1 for _, d in distances if d <= 5)
    in_10 = sum(1 for _, d in distances if d <= 10)
    in_15 = sum(1 for _, d in distances if d <= 15)

    print(f"\nPhash distance distribution (N={total}):")
    print(f" Hamming = 0: {in_0:3d} ({100*in_0/total:.0f}%) — exact match")
    print(f" Hamming ≤ 5: {in_5:3d} ({100*in_5/total:.0f}%) — auto-merge threshold (DEFAULT)")
    print(f" Hamming ≤10: {in_10:3d} ({100*in_10/total:.0f}%)")
    print(f" Hamming ≤15: {in_15:3d} ({100*in_15/total:.0f}%)")

    pct_5 = 100 * in_5 / total
    if pct_5 >= 30:
        print(f"\nVERDICT: ENABLE — {pct_5:.0f}% scen ≤Hamming 5 (≥30% threshold)")
        return 0
    print(f"\nVERDICT: DISABLE — tylko {pct_5:.0f}% scen ≤Hamming 5 (poniżej 30% threshold)")
    print("Tube robi własne screenshots, nie matchują canonical. Orphan factory ryzyko.")
    return 2


if __name__ == "__main__":
    sys.exit(main())