"""Diagnostyka: dla danego performera porównaj listę scen z TPDB, StashDB i lokalnej DB. Pokazuje: - ile scen TPDB / StashDB ma performer - ile z nich mamy w naszej DB (po canonical external_ref) - ile naszych scen ma BOTH refs (cross-source merged) - tube-scraper sceny przypięte do performera (bez canonical) Użycie: python -m scripts.compare_performer_canon "Kitana Montana" """ from __future__ import annotations import sys from collections import defaultdict from sqlalchemy import select from app.connectors.stashdb import StashDBConnector from app.connectors.tpdb import TPDBConnector from app.db import session_scope from app.models.performer import Performer, PerformerExternalRef from app.models.scene import Scene, SceneExternalRef, ScenePerformer from app.models.source import Source, SourceKind def _fmt_scene(s) -> str: title = (s.title or "")[:60] date = s.release_date.isoformat() if s.release_date else "????-??-??" studio = s.studio_id and "yes" or "-" return f" [{date}] {title!r} (studio={studio}, dur={s.duration_sec})" def main(name: str) -> None: print(f"=== Performer: {name} ===\n") # 1. TPDB tpdb = TPDBConnector() tpdb_id = tpdb.find_performer_id_by_name(name) print(f"TPDB performer_id: {tpdb_id}") tpdb_scenes: dict[str, dict] = {} if tpdb_id: for raw in tpdb.fetch_scenes_for_performer(tpdb_id, limit=2000): tpdb_scenes[raw.external_id] = { "title": raw.title, "date": raw.release_date.isoformat() if raw.release_date else None, "studio": raw.studio.name if raw.studio else None, "duration": raw.duration_sec, } print(f"TPDB scenes count: {len(tpdb_scenes)}") # 2. StashDB stashdb = StashDBConnector() stashdb_id = stashdb.find_performer_id_by_name(name) print(f"\nStashDB performer_id: {stashdb_id}") stashdb_scenes: dict[str, dict] = {} if stashdb_id: for raw in stashdb.fetch_scenes_for_performer(stashdb_id, limit=2000): stashdb_scenes[raw.external_id] = { "title": raw.title, "date": raw.release_date.isoformat() if raw.release_date else None, "studio": raw.studio.name if raw.studio else None, "duration": raw.duration_sec, } print(f"StashDB scenes count: {len(stashdb_scenes)}") # 3. Lokalna DB with session_scope() as session: perf_row = session.execute( select(Performer) .where(Performer.canonical_name.ilike(name)) .limit(1) ).scalar_one_or_none() if perf_row is None: # spróbuj normalized from app.normalize.text import normalize perf_row = session.execute( select(Performer) .where(Performer.name_normalized == normalize(name)) .limit(1) ).scalar_one_or_none() local_perf_id = perf_row.id if perf_row else None print(f"\nLocal performer: id={local_perf_id} canonical_name={perf_row.canonical_name if perf_row else None!r}") # external_refs lokalnego performera (per source kind) if perf_row: ext_rows = session.execute( select(Source.kind, PerformerExternalRef.external_id) .join(Source, Source.id == PerformerExternalRef.source_id) .where(PerformerExternalRef.performer_id == perf_row.id) ).all() print(f"Local performer external_refs:") for kind, ext_id in ext_rows: print(f" {kind.value}: {ext_id}") if local_perf_id is None: print("\n(brak lokalnego performera, kończę po fetchach)") print(f"\n=== TPDB-only ({len(tpdb_scenes)}) ===") for ext_id, s in list(tpdb_scenes.items())[:30]: print(f" [{s['date']}] {s['title']}") print(f"\n=== StashDB-only ({len(stashdb_scenes)}) ===") for ext_id, s in list(stashdb_scenes.items())[:30]: print(f" [{s['date']}] {s['title']}") return # wszystkie sceny z performerem w lokalnej DB local_scene_rows = session.execute( select(Scene) .join(ScenePerformer, ScenePerformer.scene_id == Scene.id) .where(ScenePerformer.performer_id == local_perf_id) .order_by(Scene.release_date.desc().nulls_last()) ).scalars().all() print(f"\nLocal scenes for performer: {len(local_scene_rows)}") # per scene: zbierz refs (tpdb / stashdb / scraper) scene_refs: dict[str, dict[str, list[str]]] = {} for s in local_scene_rows: refs = session.execute( select(Source.kind, SceneExternalRef.external_id) .join(Source, Source.id == SceneExternalRef.source_id) .where(SceneExternalRef.scene_id == s.id) ).all() buckets: dict[str, list[str]] = defaultdict(list) for kind, ext_id in refs: buckets[kind.value].append(ext_id) scene_refs[str(s.id)] = dict(buckets) # statystyki n_with_tpdb_ref = sum(1 for r in scene_refs.values() if r.get("tpdb")) n_with_stashdb_ref = sum(1 for r in scene_refs.values() if r.get("stashdb")) n_with_both = sum(1 for r in scene_refs.values() if r.get("tpdb") and r.get("stashdb")) n_canonical = sum(1 for r in scene_refs.values() if r.get("tpdb") or r.get("stashdb")) n_scraper_only = sum( 1 for r in scene_refs.values() if not r.get("tpdb") and not r.get("stashdb") and r.get("scraper") ) print(f"\nLocal scene breakdown:") print(f" with TPDB ref: {n_with_tpdb_ref}") print(f" with StashDB ref: {n_with_stashdb_ref}") print(f" with BOTH (merged!): {n_with_both}") print(f" canonical (any): {n_canonical}") print(f" scraper-only: {n_scraper_only}") # 4. Co z TPDB/StashDB nie trafiło do bazy? local_tpdb_ids = set() local_stashdb_ids = set() for refs in scene_refs.values(): for x in refs.get("tpdb") or []: local_tpdb_ids.add(x) for x in refs.get("stashdb") or []: local_stashdb_ids.add(x) tpdb_missing = set(tpdb_scenes.keys()) - local_tpdb_ids stashdb_missing = set(stashdb_scenes.keys()) - local_stashdb_ids print(f"\nMissing from local DB:") print(f" TPDB scenes not in DB: {len(tpdb_missing)} / {len(tpdb_scenes)}") print(f" StashDB scenes not in DB: {len(stashdb_missing)} / {len(stashdb_scenes)}") if tpdb_missing: print(f"\n--- TPDB scenes missing from DB (first 20) ---") for ext_id in list(tpdb_missing)[:20]: s = tpdb_scenes[ext_id] print(f" [{s['date']}] {s['title']!r} studio={s['studio']}") if stashdb_missing: print(f"\n--- StashDB scenes missing from DB (first 20) ---") for ext_id in list(stashdb_missing)[:20]: s = stashdb_scenes[ext_id] print(f" [{s['date']}] {s['title']!r} studio={s['studio']}") # cross-source: TPDB ∩ StashDB (po dacie+tytule, bo external_id z dwóch baz są różne) # heurystyka: ta sama data + token_set_ratio(tytuł) >= 80 from rapidfuzz import fuzz cross_pairs = [] for tpdb_eid, t in tpdb_scenes.items(): for stashdb_eid, s in stashdb_scenes.items(): if t["date"] and s["date"] and t["date"] == s["date"]: score = fuzz.token_set_ratio(t["title"] or "", s["title"] or "") if score >= 80: cross_pairs.append((t, s, score)) break print(f"\nTPDB ∩ StashDB (same date + title fuzzy ≥80): {len(cross_pairs)} par") if __name__ == "__main__": if len(sys.argv) < 2: print("usage: python -m scripts.compare_performer_canon \"\"") sys.exit(1) main(sys.argv[1])