goon/scripts/compare_performer_canon.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

190 lines
8 KiB
Python

"""Diagnostyka: dla danego performera porównaj listę scen z TPDB, StashDB i lokalnej DB.
Pokazuje:
- ile scen TPDB / StashDB ma performer
- ile z nich mamy w naszej DB (po canonical external_ref)
- ile naszych scen ma BOTH refs (cross-source merged)
- tube-scraper sceny przypięte do performera (bez canonical)
Użycie: python -m scripts.compare_performer_canon "Kitana Montana"
"""
from __future__ import annotations
import sys
from collections import defaultdict
from sqlalchemy import select
from app.connectors.stashdb import StashDBConnector
from app.connectors.tpdb import TPDBConnector
from app.db import session_scope
from app.models.performer import Performer, PerformerExternalRef
from app.models.scene import Scene, SceneExternalRef, ScenePerformer
from app.models.source import Source, SourceKind
def _fmt_scene(s) -> str:
    """Return a one-line, human-readable summary of a scene.

    Reads ``title``, ``release_date``, ``studio_id`` and ``duration_sec``
    from *s*. The title is truncated to 60 characters, a missing release
    date is rendered as ``????-??-??`` and the studio is reported only as
    present (``yes``) or absent (``-``).
    """
    title = (s.title or "")[:60]
    date = s.release_date.isoformat() if s.release_date else "????-??-??"
    # Conditional expression instead of the fragile `x and a or b` idiom.
    studio = "yes" if s.studio_id else "-"
    return f" [{date}] {title!r} (studio={studio}, dur={s.duration_sec})"
def _collect_scenes(connector, performer_id, limit: int = 2000) -> dict[str, dict]:
    """Fetch a performer's scenes from one connector, keyed by external id.

    Returns an empty dict when *performer_id* is falsy (performer not found
    upstream). Each value holds the ``title``, ISO ``date`` (or ``None``),
    ``studio`` name (or ``None``) and ``duration`` of the scene.
    """
    if not performer_id:
        return {}
    return {
        raw.external_id: {
            "title": raw.title,
            "date": raw.release_date.isoformat() if raw.release_date else None,
            "studio": raw.studio.name if raw.studio else None,
            "duration": raw.duration_sec,
        }
        for raw in connector.fetch_scenes_for_performer(performer_id, limit=limit)
    }


def _find_local_performer(session, name: str):
    """Look up the local Performer row for *name*, or return ``None``.

    Tries a case-insensitive match on ``canonical_name`` first and falls
    back to an exact match on the normalized name.
    """
    # NOTE: ILIKE treats `%` and `_` in `name` as wildcards.
    perf_row = session.execute(
        select(Performer)
        .where(Performer.canonical_name.ilike(name))
        .limit(1)
    ).scalar_one_or_none()
    if perf_row is None:
        # Fall back to the normalized form of the name.
        from app.normalize.text import normalize

        perf_row = session.execute(
            select(Performer)
            .where(Performer.name_normalized == normalize(name))
            .limit(1)
        ).scalar_one_or_none()
    return perf_row


def _load_scene_refs(session, performer_id, scenes) -> dict[str, dict[str, list[str]]]:
    """Group external ids by source kind for every scene of a performer.

    The result maps ``str(scene.id)`` to ``{source_kind_value: [external_id, ...]}``.
    All refs are loaded with a single joined query instead of one query per scene.
    """
    # Seed every scene so scenes without any external ref are still present.
    grouped: dict[str, dict[str, list[str]]] = {
        str(s.id): defaultdict(list) for s in scenes
    }
    rows = session.execute(
        select(SceneExternalRef.scene_id, Source.kind, SceneExternalRef.external_id)
        .select_from(SceneExternalRef)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .join(ScenePerformer, ScenePerformer.scene_id == SceneExternalRef.scene_id)
        .where(ScenePerformer.performer_id == performer_id)
    ).all()
    for scene_id, kind, ext_id in rows:
        grouped[str(scene_id)][kind.value].append(ext_id)
    return {scene_id: dict(buckets) for scene_id, buckets in grouped.items()}


def _print_missing(label: str, missing_ids: list, scenes: dict[str, dict]) -> None:
    """Print up to 20 upstream scenes (by external id) absent from the local DB."""
    if not missing_ids:
        return
    print(f"\n--- {label} scenes missing from DB (first 20) ---")
    for ext_id in missing_ids[:20]:
        s = scenes[ext_id]
        print(f" [{s['date']}] {s['title']!r} studio={s['studio']}")


def _pair_cross_source(tpdb_scenes: dict[str, dict], stashdb_scenes: dict[str, dict]) -> list:
    """Pair TPDB scenes with StashDB scenes that look like the same scene.

    External ids differ between the two databases, so the heuristic is:
    same release date and ``token_set_ratio`` of the titles >= 80. Each TPDB
    scene is paired with at most one StashDB scene (the first match in
    fetch order).
    """
    from rapidfuzz import fuzz

    # Index StashDB scenes by date so only same-date candidates are scored.
    stash_by_date: dict[str, list[dict]] = defaultdict(list)
    for s in stashdb_scenes.values():
        if s["date"]:
            stash_by_date[s["date"]].append(s)

    pairs = []
    for t in tpdb_scenes.values():
        if not t["date"]:
            continue
        for s in stash_by_date.get(t["date"], ()):
            score = fuzz.token_set_ratio(t["title"] or "", s["title"] or "")
            if score >= 80:
                pairs.append((t, s, score))
                break
    return pairs


def main(name: str) -> None:
    """Print a diagnostic comparison of a performer's scenes across sources.

    Fetches the performer's scenes from TPDB and StashDB, looks the performer
    up in the local database, and prints: upstream scene counts, how many
    local scenes carry TPDB / StashDB / both / scraper-only refs, which
    upstream scenes are missing locally, and how many TPDB scenes have a
    likely StashDB counterpart.

    Args:
        name: Performer name to look up in all three places.
    """
    print(f"=== Performer: {name} ===\n")

    # 1. TPDB
    tpdb = TPDBConnector()
    tpdb_id = tpdb.find_performer_id_by_name(name)
    print(f"TPDB performer_id: {tpdb_id}")
    tpdb_scenes = _collect_scenes(tpdb, tpdb_id)
    print(f"TPDB scenes count: {len(tpdb_scenes)}")

    # 2. StashDB
    stashdb = StashDBConnector()
    stashdb_id = stashdb.find_performer_id_by_name(name)
    print(f"\nStashDB performer_id: {stashdb_id}")
    stashdb_scenes = _collect_scenes(stashdb, stashdb_id)
    print(f"StashDB scenes count: {len(stashdb_scenes)}")

    # 3. Local DB
    with session_scope() as session:
        perf_row = _find_local_performer(session, name)
        local_perf_id = perf_row.id if perf_row else None
        canonical_name = perf_row.canonical_name if perf_row else None
        print(f"\nLocal performer: id={local_perf_id} canonical_name={canonical_name!r}")

        # External refs of the local performer (per source kind).
        if perf_row:
            ext_rows = session.execute(
                select(Source.kind, PerformerExternalRef.external_id)
                .join(Source, Source.id == PerformerExternalRef.source_id)
                .where(PerformerExternalRef.performer_id == perf_row.id)
            ).all()
            print("Local performer external_refs:")
            for kind, ext_id in ext_rows:
                print(f" {kind.value}: {ext_id}")

        if local_perf_id is None:
            # Nothing to compare against locally: show a sample of what was fetched.
            print("\n(brak lokalnego performera, kończę po fetchach)")
            print(f"\n=== TPDB-only ({len(tpdb_scenes)}) ===")
            for s in list(tpdb_scenes.values())[:30]:
                print(f" [{s['date']}] {s['title']}")
            print(f"\n=== StashDB-only ({len(stashdb_scenes)}) ===")
            for s in list(stashdb_scenes.values())[:30]:
                print(f" [{s['date']}] {s['title']}")
            return

        # All scenes linked to the performer in the local DB.
        local_scene_rows = session.execute(
            select(Scene)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == local_perf_id)
            .order_by(Scene.release_date.desc().nulls_last())
        ).scalars().all()
        print(f"\nLocal scenes for performer: {len(local_scene_rows)}")

        # Per scene: collect refs (tpdb / stashdb / scraper). Only plain dicts
        # leave the session scope.
        scene_refs = _load_scene_refs(session, local_perf_id, local_scene_rows)

    # Statistics.
    n_with_tpdb_ref = n_with_stashdb_ref = n_with_both = 0
    n_canonical = n_scraper_only = 0
    for refs in scene_refs.values():
        has_tpdb = bool(refs.get("tpdb"))
        has_stashdb = bool(refs.get("stashdb"))
        n_with_tpdb_ref += has_tpdb
        n_with_stashdb_ref += has_stashdb
        n_with_both += has_tpdb and has_stashdb
        n_canonical += has_tpdb or has_stashdb
        n_scraper_only += not has_tpdb and not has_stashdb and bool(refs.get("scraper"))
    print("\nLocal scene breakdown:")
    print(f" with TPDB ref: {n_with_tpdb_ref}")
    print(f" with StashDB ref: {n_with_stashdb_ref}")
    print(f" with BOTH (merged!): {n_with_both}")
    print(f" canonical (any): {n_canonical}")
    print(f" scraper-only: {n_scraper_only}")

    # 4. Which TPDB/StashDB scenes did not make it into the local DB?
    local_tpdb_ids = {x for refs in scene_refs.values() for x in refs.get("tpdb") or ()}
    local_stashdb_ids = {x for refs in scene_refs.values() for x in refs.get("stashdb") or ()}
    # Lists in connector fetch order keep the "first 20" sample stable between
    # runs (slicing a set would not).
    tpdb_missing = [eid for eid in tpdb_scenes if eid not in local_tpdb_ids]
    stashdb_missing = [eid for eid in stashdb_scenes if eid not in local_stashdb_ids]
    print("\nMissing from local DB:")
    print(f" TPDB scenes not in DB: {len(tpdb_missing)} / {len(tpdb_scenes)}")
    print(f" StashDB scenes not in DB: {len(stashdb_missing)} / {len(stashdb_scenes)}")
    _print_missing("TPDB", tpdb_missing, tpdb_scenes)
    _print_missing("StashDB", stashdb_missing, stashdb_scenes)

    # Cross-source: TPDB ∩ StashDB, matched by date + title because external
    # ids from the two databases differ.
    cross_pairs = _pair_cross_source(tpdb_scenes, stashdb_scenes)
    print(f"\nTPDB ∩ StashDB (same date + title fuzzy ≥80): {len(cross_pairs)} par")
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # keeping the usage error out of stdout.
        sys.exit("usage: python -m scripts.compare_performer_canon \"<performer name>\"")
    # Join every argument so an unquoted multi-word name (Kitana Montana)
    # is not silently truncated to its first word.
    main(" ".join(sys.argv[1:]))