"""Diagnostyka: dla danego performera porównaj listę scen z TPDB, StashDB i lokalnej DB.

Pokazuje:
  - ile scen TPDB / StashDB ma performer
  - ile z nich mamy w naszej DB (po canonical external_ref)
  - ile naszych scen ma BOTH refs (cross-source merged)
  - tube-scraper sceny przypięte do performera (bez canonical)

Użycie: python -m scripts.compare_performer_canon "Kitana Montana"
"""
from __future__ import annotations

import sys
from collections import defaultdict

from sqlalchemy import select

from app.connectors.stashdb import StashDBConnector
from app.connectors.tpdb import TPDBConnector
from app.db import session_scope
from app.models.performer import Performer, PerformerExternalRef
from app.models.scene import Scene, SceneExternalRef, ScenePerformer
from app.models.source import Source, SourceKind


def _fmt_scene(s) -> str:
    """Render one scene as an indented, single-line summary.

    Reads ``title``, ``release_date``, ``studio_id`` and ``duration_sec`` from
    ``s``. The title is cut to 60 characters, a missing release date is shown
    as ``????-??-??``, and the studio is reported only as present (``yes``)
    or absent (``-``).
    """
    short_title = (s.title or "")[:60]
    if s.release_date:
        date_text = s.release_date.isoformat()
    else:
        date_text = "????-??-??"
    studio_flag = "yes" if s.studio_id else "-"
    return f"  [{date_text}] {short_title!r} (studio={studio_flag}, dur={s.duration_sec})"


def _collect_remote_scenes(connector, performer_id) -> dict:
    """Fetch a performer's scenes from ``connector`` and index them by external id.

    Returns an empty dict when ``performer_id`` is falsy (performer not found).
    Each value keeps only the fields this report prints: ``title``, ``date``
    (ISO string or ``None``), ``studio`` (name or ``None``) and ``duration``.
    The connector is called with ``limit=2000``.
    """
    scenes: dict[str, dict] = {}
    if not performer_id:
        return scenes
    for raw in connector.fetch_scenes_for_performer(performer_id, limit=2000):
        scenes[raw.external_id] = {
            "title": raw.title,
            "date": raw.release_date.isoformat() if raw.release_date else None,
            "studio": raw.studio.name if raw.studio else None,
            "duration": raw.duration_sec,
        }
    return scenes


def _print_scene_sample(scenes, limit) -> None:
    """Print date and title of the first ``limit`` scenes, in dict order."""
    for scene in list(scenes.values())[:limit]:
        print(f"  [{scene['date']}] {scene['title']}")


def _load_scene_refs(session, performer_id, scene_ids) -> dict:
    """Return ``{str(scene_id): {source_kind_value: [external_id, ...]}}``.

    Every id in ``scene_ids`` gets an entry, even when the scene has no
    external refs. All refs are loaded with ONE query (filtered through a
    subquery on ``ScenePerformer``) instead of one query per scene.
    """
    scene_refs: dict[str, dict[str, list[str]]] = {str(sid): {} for sid in scene_ids}
    performer_scene_ids = select(ScenePerformer.scene_id).where(
        ScenePerformer.performer_id == performer_id
    )
    rows = session.execute(
        select(SceneExternalRef.scene_id, Source.kind, SceneExternalRef.external_id)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id.in_(performer_scene_ids))
    ).all()
    for scene_id, kind, ext_id in rows:
        buckets = scene_refs.get(str(scene_id))
        if buckets is None:
            # Ref for a scene that was not in the loaded scene list; ignore it
            # so the statistics cover exactly the scenes counted above.
            continue
        buckets.setdefault(kind.value, []).append(ext_id)
    return scene_refs


def _print_missing(label, remote_scenes, missing_ids, limit=20) -> None:
    """Print up to ``limit`` remote scenes that have no matching local ref.

    The ids come from a set, so they are sorted (newest date first, undated
    last, external id as tie-break) to make the sample stable between runs.
    """
    if not missing_ids:
        return
    ordered = sorted(
        missing_ids,
        key=lambda ext_id: (remote_scenes[ext_id]["date"] or "", str(ext_id)),
        reverse=True,
    )
    print(f"\n--- {label} scenes missing from DB (first {limit}) ---")
    for ext_id in ordered[:limit]:
        scene = remote_scenes[ext_id]
        print(f"  [{scene['date']}] {scene['title']!r}  studio={scene['studio']}")


def _count_cross_source_pairs(tpdb_scenes, stashdb_scenes) -> int:
    """Count TPDB scenes that have a same-date, similar-title StashDB scene.

    Heuristic: identical ISO date and ``token_set_ratio(title) >= 80``. Each
    TPDB scene is counted at most once. StashDB titles are bucketed by date so
    only same-date candidates are compared, rather than every pair.
    """
    from rapidfuzz import fuzz

    titles_by_date: dict[str, list[str]] = defaultdict(list)
    for scene in stashdb_scenes.values():
        if scene["date"]:
            titles_by_date[scene["date"]].append(scene["title"] or "")

    matched = 0
    for scene in tpdb_scenes.values():
        if not scene["date"]:
            continue
        title = scene["title"] or ""
        candidates = titles_by_date.get(scene["date"], ())
        if any(fuzz.token_set_ratio(title, other) >= 80 for other in candidates):
            matched += 1
    return matched


def main(name: str) -> None:
    """Print a TPDB / StashDB / local-DB scene comparison for one performer.

    Steps:
      1. Look the performer up in TPDB and StashDB and fetch their scenes.
      2. Find the local performer (case-insensitive ``canonical_name`` match,
         then normalized-name match) and list its external refs.
      3. If there is no local performer, print a sample of the remote scenes
         and stop.
      4. Otherwise print how many local scenes carry TPDB / StashDB / both /
         scraper-only refs, which remote scenes are missing locally, and how
         many TPDB scenes appear to have a StashDB counterpart.

    Args:
        name: Performer name as typed on the command line.
    """
    print(f"=== Performer: {name} ===\n")

    # 1. TPDB
    tpdb = TPDBConnector()
    tpdb_id = tpdb.find_performer_id_by_name(name)
    print(f"TPDB performer_id: {tpdb_id}")
    tpdb_scenes = _collect_remote_scenes(tpdb, tpdb_id)
    print(f"TPDB scenes count: {len(tpdb_scenes)}")

    # 2. StashDB
    stashdb = StashDBConnector()
    stashdb_id = stashdb.find_performer_id_by_name(name)
    print(f"\nStashDB performer_id: {stashdb_id}")
    stashdb_scenes = _collect_remote_scenes(stashdb, stashdb_id)
    print(f"StashDB scenes count: {len(stashdb_scenes)}")

    # 3. Local DB
    with session_scope() as session:
        # NOTE(review): ilike() treats '%' and '_' in `name` as wildcards;
        # harmless for typical names, but confirm if exact matching matters.
        perf_row = session.execute(
            select(Performer)
            .where(Performer.canonical_name.ilike(name))
            .limit(1)
        ).scalar_one_or_none()

        if perf_row is None:
            # Fall back to the normalized name.
            from app.normalize.text import normalize
            perf_row = session.execute(
                select(Performer)
                .where(Performer.name_normalized == normalize(name))
                .limit(1)
            ).scalar_one_or_none()

        local_perf_id = perf_row.id if perf_row else None
        canonical_name = perf_row.canonical_name if perf_row else None
        print(f"\nLocal performer: id={local_perf_id} canonical_name={canonical_name!r}")

        # External refs of the local performer (per source kind).
        if perf_row:
            ext_rows = session.execute(
                select(Source.kind, PerformerExternalRef.external_id)
                .join(Source, Source.id == PerformerExternalRef.source_id)
                .where(PerformerExternalRef.performer_id == perf_row.id)
            ).all()
            print("Local performer external_refs:")
            for kind, ext_id in ext_rows:
                print(f"  {kind.value}: {ext_id}")

        if local_perf_id is None:
            print("\n(brak lokalnego performera, kończę po fetchach)")
            print(f"\n=== TPDB-only ({len(tpdb_scenes)}) ===")
            _print_scene_sample(tpdb_scenes, 30)
            print(f"\n=== StashDB-only ({len(stashdb_scenes)}) ===")
            _print_scene_sample(stashdb_scenes, 30)
            return

        # All local scenes linked to the performer.
        local_scene_rows = session.execute(
            select(Scene)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == local_perf_id)
            .order_by(Scene.release_date.desc().nulls_last())
        ).scalars().all()
        print(f"\nLocal scenes for performer: {len(local_scene_rows)}")

        # Per scene: external refs grouped by source kind (tpdb / stashdb / scraper).
        scene_refs = _load_scene_refs(
            session, local_perf_id, [s.id for s in local_scene_rows]
        )

        # Statistics
        n_with_tpdb_ref = sum(1 for r in scene_refs.values() if r.get("tpdb"))
        n_with_stashdb_ref = sum(1 for r in scene_refs.values() if r.get("stashdb"))
        n_with_both = sum(1 for r in scene_refs.values() if r.get("tpdb") and r.get("stashdb"))
        n_canonical = sum(1 for r in scene_refs.values() if r.get("tpdb") or r.get("stashdb"))
        n_scraper_only = sum(
            1 for r in scene_refs.values()
            if not r.get("tpdb") and not r.get("stashdb") and r.get("scraper")
        )
        print("\nLocal scene breakdown:")
        print(f"  with TPDB ref:        {n_with_tpdb_ref}")
        print(f"  with StashDB ref:     {n_with_stashdb_ref}")
        print(f"  with BOTH (merged!):  {n_with_both}")
        print(f"  canonical (any):      {n_canonical}")
        print(f"  scraper-only:         {n_scraper_only}")

        # 4. Which TPDB/StashDB scenes never made it into the local DB?
        local_tpdb_ids = {
            ext_id for refs in scene_refs.values() for ext_id in refs.get("tpdb", ())
        }
        local_stashdb_ids = {
            ext_id for refs in scene_refs.values() for ext_id in refs.get("stashdb", ())
        }

        tpdb_missing = set(tpdb_scenes) - local_tpdb_ids
        stashdb_missing = set(stashdb_scenes) - local_stashdb_ids
        print("\nMissing from local DB:")
        print(f"  TPDB scenes not in DB:    {len(tpdb_missing)} / {len(tpdb_scenes)}")
        print(f"  StashDB scenes not in DB: {len(stashdb_missing)} / {len(stashdb_scenes)}")

        _print_missing("TPDB", tpdb_scenes, tpdb_missing)
        _print_missing("StashDB", stashdb_scenes, stashdb_missing)

        # Cross-source overlap: external ids differ between the two databases,
        # so scenes are paired by date + fuzzy title instead.
        n_cross = _count_cross_source_pairs(tpdb_scenes, stashdb_scenes)
        print(f"\nTPDB ∩ StashDB (same date + title fuzzy ≥80): {n_cross} par")


if __name__ == "__main__":
    # Script entry point: the performer name is the first CLI argument;
    # without it, print the usage line and exit with status 1.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("usage: python -m scripts.compare_performer_canon \"<performer name>\"")
        sys.exit(1)
    main(cli_args[0])