"""Znajdź performerów którym brakuje TPDB lub StashDB external_ref, mimo że mają sporo scen. Use-case: ten sam bug co Kitana Montana — performer ma stashdb ref ale nie tpdb (lub odwrotnie), więc performer-driven nigdy nie zaciąga z tej drugiej bazy. Skutek: niedoszacowanie sceny per performer + zerowa szansa na cross-source merge. Output: lista (canonical_name, scene_count, has_tpdb, has_stashdb) posortowana po scene_count desc. """ from __future__ import annotations import sys from sqlalchemy import case, func, select from app.db import session_scope from app.models.performer import Performer, PerformerExternalRef from app.models.scene import ScenePerformer from app.models.source import Source, SourceKind def main(top_n: int = 50) -> None: with session_scope() as session: tpdb_src = session.execute( select(Source.id).where(Source.kind == SourceKind.tpdb) ).scalar_one() stashdb_src = session.execute( select(Source.id).where(Source.kind == SourceKind.stashdb) ).scalar_one() # subq: dla każdego performera czy ma TPDB / StashDB ref has_tpdb = ( select(PerformerExternalRef.performer_id) .where(PerformerExternalRef.source_id == tpdb_src) .distinct() ).subquery() has_stashdb = ( select(PerformerExternalRef.performer_id) .where(PerformerExternalRef.source_id == stashdb_src) .distinct() ).subquery() # main query rows = session.execute( select( Performer.id, Performer.canonical_name, func.count(ScenePerformer.scene_id).label("c"), case((has_tpdb.c.performer_id.isnot(None), 1), else_=0).label("has_tpdb"), case((has_stashdb.c.performer_id.isnot(None), 1), else_=0).label("has_stashdb"), ) .outerjoin(ScenePerformer, ScenePerformer.performer_id == Performer.id) .outerjoin(has_tpdb, has_tpdb.c.performer_id == Performer.id) .outerjoin(has_stashdb, has_stashdb.c.performer_id == Performer.id) .group_by( Performer.id, Performer.canonical_name, has_tpdb.c.performer_id, has_stashdb.c.performer_id, ) .order_by(func.count(ScenePerformer.scene_id).desc()) .limit(top_n * 5) # buffer to filter ).all() # bucket only_stashdb: list[tuple] = [] only_tpdb: list[tuple] = [] no_canonical: list[tuple] = [] both: list[tuple] = [] for pid, name, count, has_t, has_s in rows: if has_t and has_s: both.append((name, count)) elif has_s and not has_t: only_stashdb.append((name, count)) elif has_t and not has_s: only_tpdb.append((name, count)) else: no_canonical.append((name, count)) print(f"=== Top {top_n*5} performers by scene_count ===") print(f" both canonical refs: {len(both)}") print(f" only stashdb (miss TPDB): {len(only_stashdb)}") print(f" only tpdb (miss StashDB): {len(only_tpdb)}") print(f" no canonical refs: {len(no_canonical)}") print(f"\n--- Top {top_n} missing TPDB (have stashdb) ---") for name, c in only_stashdb[:top_n]: print(f" {c:5d} {name}") print(f"\n--- Top {top_n} missing StashDB (have tpdb) ---") for name, c in only_tpdb[:top_n]: print(f" {c:5d} {name}") print(f"\n--- Top {top_n} no canonical refs at all (probably scraper-only or alias issue) ---") for name, c in no_canonical[:top_n]: print(f" {c:5d} {name}") if __name__ == "__main__": n = int(sys.argv[1]) if len(sys.argv) > 1 else 30 main(n)