# Goon — self-hosted aggregator for adult-content scene metadata. Indexes
# scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source
# deduplication via perceptual hash + Levenshtein distance. FastAPI backend +
# APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free,
# donation-funded. See README for details.
"""Diagnostyka: dla danego performera porównaj listę scen z TPDB, StashDB i lokalnej DB.
|
|
|
|
Pokazuje:
|
|
- ile scen TPDB / StashDB ma performer
|
|
- ile z nich mamy w naszej DB (po canonical external_ref)
|
|
- ile naszych scen ma BOTH refs (cross-source merged)
|
|
- tube-scraper sceny przypięte do performera (bez canonical)
|
|
|
|
Użycie: python -m scripts.compare_performer_canon "Kitana Montana"
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
from sqlalchemy import select
|
|
|
|
from app.connectors.stashdb import StashDBConnector
|
|
from app.connectors.tpdb import TPDBConnector
|
|
from app.db import session_scope
|
|
from app.models.performer import Performer, PerformerExternalRef
|
|
from app.models.scene import Scene, SceneExternalRef, ScenePerformer
|
|
from app.models.source import Source, SourceKind
|
|
|
|
|
|
def _fmt_scene(s) -> str:
    """Render one scene as a single line for console output.

    Args:
        s: Any object exposing ``title``, ``release_date``, ``studio_id`` and
            ``duration_sec`` attributes (presumably a ``Scene`` row — confirm
            against callers).

    Returns:
        A line of the form `` [YYYY-MM-DD] 'title' (studio=yes|-, dur=N)``.
        The title is truncated to 60 characters, a missing release date is
        shown as ``????-??-??`` and ``studio`` only reports whether a studio
        id is set.
    """
    title = (s.title or "")[:60]
    date = s.release_date.isoformat() if s.release_date else "????-??-??"
    # Conditional expression instead of the fragile `cond and a or b` idiom.
    studio = "yes" if s.studio_id else "-"
    return f" [{date}] {title!r} (studio={studio}, dur={s.duration_sec})"
|
|
|
|
|
|
# Upper bound on scenes requested per performer from each canonical source.
_REMOTE_SCENE_LIMIT = 2000
# How many remote scenes to list when the performer is unknown locally.
_REMOTE_SAMPLE_SIZE = 30
# How many missing scenes to list per canonical source.
_MISSING_SAMPLE_SIZE = 20
# Minimum rapidfuzz token_set_ratio for two same-day titles to count as a match.
_TITLE_MATCH_THRESHOLD = 80


def _fetch_remote_scenes(connector, label: str, name: str) -> dict[str, dict]:
    """Look the performer up in one canonical source and fetch their scenes.

    Prints the resolved performer id before fetching and the scene count after.

    Args:
        connector: Connector exposing ``find_performer_id_by_name`` and
            ``fetch_scenes_for_performer`` (TPDB or StashDB).
        label: Human-readable source name used in the printed lines.
        name: Performer name to search for.

    Returns:
        Mapping of external scene id to a plain dict with the keys ``title``,
        ``date`` (ISO string or None), ``studio`` (name or None) and
        ``duration``. Empty when the performer was not found.
    """
    performer_id = connector.find_performer_id_by_name(name)
    print(f"{label} performer_id: {performer_id}")
    scenes: dict[str, dict] = {}
    if performer_id:
        for raw in connector.fetch_scenes_for_performer(performer_id, limit=_REMOTE_SCENE_LIMIT):
            scenes[raw.external_id] = {
                "title": raw.title,
                "date": raw.release_date.isoformat() if raw.release_date else None,
                "studio": raw.studio.name if raw.studio else None,
                "duration": raw.duration_sec,
            }
    print(f"{label} scenes count: {len(scenes)}")
    return scenes


def _find_local_performer(session, name: str):
    """Return the local ``Performer`` row for *name*, or None.

    Tries a case-insensitive match on ``canonical_name`` first, then falls back
    to an exact match on the normalized name.
    """
    # NOTE: ilike() treats `%` and `_` in *name* as SQL wildcards.
    perf_row = session.execute(
        select(Performer).where(Performer.canonical_name.ilike(name)).limit(1)
    ).scalar_one_or_none()
    if perf_row is not None:
        return perf_row

    # Deferred import: only needed for the fallback lookup.
    from app.normalize.text import normalize

    return session.execute(
        select(Performer).where(Performer.name_normalized == normalize(name)).limit(1)
    ).scalar_one_or_none()


def _collect_scene_refs(session, scene_rows) -> dict[str, dict[str, list[str]]]:
    """Group the external ids of every scene by source kind.

    Returns:
        ``{scene_id: {source_kind_value: [external_id, ...]}}``. A scene with
        no external refs maps to an empty dict.
    """
    scene_refs: dict[str, dict[str, list[str]]] = {}
    # One query per scene (N+1); kept as-is for this one-off diagnostic.
    for scene in scene_rows:
        refs = session.execute(
            select(Source.kind, SceneExternalRef.external_id)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id == scene.id)
        ).all()
        buckets: dict[str, list[str]] = defaultdict(list)
        for kind, ext_id in refs:
            buckets[kind.value].append(ext_id)
        scene_refs[str(scene.id)] = dict(buckets)
    return scene_refs


def _print_remote_sample(label: str, scenes: dict[str, dict]) -> None:
    """Print a header with the total count and the first few remote scenes."""
    print(f"\n=== {label}-only ({len(scenes)}) ===")
    for scene in list(scenes.values())[:_REMOTE_SAMPLE_SIZE]:
        print(f" [{scene['date']}] {scene['title']}")


def _print_breakdown(scene_refs: dict[str, dict[str, list[str]]]) -> None:
    """Print how many local scenes carry TPDB / StashDB / scraper-only refs."""
    all_refs = list(scene_refs.values())
    n_with_tpdb_ref = sum(1 for r in all_refs if r.get("tpdb"))
    n_with_stashdb_ref = sum(1 for r in all_refs if r.get("stashdb"))
    n_with_both = sum(1 for r in all_refs if r.get("tpdb") and r.get("stashdb"))
    n_canonical = sum(1 for r in all_refs if r.get("tpdb") or r.get("stashdb"))
    n_scraper_only = sum(
        1
        for r in all_refs
        if not r.get("tpdb") and not r.get("stashdb") and r.get("scraper")
    )
    print("\nLocal scene breakdown:")
    print(f" with TPDB ref: {n_with_tpdb_ref}")
    print(f" with StashDB ref: {n_with_stashdb_ref}")
    print(f" with BOTH (merged!): {n_with_both}")
    print(f" canonical (any): {n_canonical}")
    print(f" scraper-only: {n_scraper_only}")


def _print_missing(label: str, missing_ids: list[str], remote_scenes: dict[str, dict]) -> None:
    """List the first few remote scenes that have no matching local ref."""
    if not missing_ids:
        return
    print(f"\n--- {label} scenes missing from DB (first {_MISSING_SAMPLE_SIZE}) ---")
    for ext_id in missing_ids[:_MISSING_SAMPLE_SIZE]:
        scene = remote_scenes[ext_id]
        print(f" [{scene['date']}] {scene['title']!r} studio={scene['studio']}")


def _match_cross_source(tpdb_scenes: dict[str, dict], stashdb_scenes: dict[str, dict]) -> list[tuple]:
    """Pair TPDB scenes with StashDB scenes that look like the same release.

    External ids differ between the two databases, so the heuristic is: same
    release date and ``token_set_ratio`` of the titles at or above
    ``_TITLE_MATCH_THRESHOLD``. Each TPDB scene is paired with at most one
    StashDB scene (the first qualifying one in fetch order).

    Returns:
        List of ``(tpdb_scene, stashdb_scene, score)`` tuples.
    """
    # Deferred import: rapidfuzz is only needed for this last step.
    from rapidfuzz import fuzz

    # Index StashDB scenes by date so each TPDB scene is only compared with
    # same-day candidates instead of the whole StashDB list.
    stashdb_by_date: dict[str, list[dict]] = defaultdict(list)
    for candidate in stashdb_scenes.values():
        if candidate["date"]:
            stashdb_by_date[candidate["date"]].append(candidate)

    pairs = []
    for tpdb_scene in tpdb_scenes.values():
        if not tpdb_scene["date"]:
            continue
        for candidate in stashdb_by_date.get(tpdb_scene["date"], ()):
            score = fuzz.token_set_ratio(tpdb_scene["title"] or "", candidate["title"] or "")
            if score >= _TITLE_MATCH_THRESHOLD:
                pairs.append((tpdb_scene, candidate, score))
                break
    return pairs


def main(name: str) -> None:
    """Compare a performer's scenes across TPDB, StashDB and the local DB.

    Fetches the performer's scenes from both canonical sources, looks the
    performer up locally, then prints: the local external refs, a breakdown of
    local scenes by ref kind, which remote scenes are missing locally, and how
    many TPDB/StashDB scenes appear to be the same release.

    Args:
        name: Performer name as typed on the command line.
    """
    print(f"=== Performer: {name} ===\n")

    # 1. TPDB
    tpdb_scenes = _fetch_remote_scenes(TPDBConnector(), "TPDB", name)

    # 2. StashDB (blank line separates the two source sections)
    print()
    stashdb_scenes = _fetch_remote_scenes(StashDBConnector(), "StashDB", name)

    # 3. Local DB
    with session_scope() as session:
        perf_row = _find_local_performer(session, name)
        local_perf_id = perf_row.id if perf_row is not None else None
        canonical_name = perf_row.canonical_name if perf_row is not None else None
        print(f"\nLocal performer: id={local_perf_id} canonical_name={canonical_name!r}")

        if perf_row is None:
            # Nothing local to compare against: show what the sources returned.
            print("\n(brak lokalnego performera, kończę po fetchach)")
            _print_remote_sample("TPDB", tpdb_scenes)
            _print_remote_sample("StashDB", stashdb_scenes)
            return

        # External refs of the local performer (per source kind).
        ext_rows = session.execute(
            select(Source.kind, PerformerExternalRef.external_id)
            .join(Source, Source.id == PerformerExternalRef.source_id)
            .where(PerformerExternalRef.performer_id == perf_row.id)
        ).all()
        print("Local performer external_refs:")
        for kind, ext_id in ext_rows:
            print(f" {kind.value}: {ext_id}")

        # All scenes linked to the performer in the local DB.
        local_scene_rows = session.execute(
            select(Scene)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == local_perf_id)
            .order_by(Scene.release_date.desc().nulls_last())
        ).scalars().all()
        print(f"\nLocal scenes for performer: {len(local_scene_rows)}")

        # Per scene: collect refs (tpdb / stashdb / scraper).
        scene_refs = _collect_scene_refs(session, local_scene_rows)

    # Everything below works on plain dicts, so the session is no longer needed.
    _print_breakdown(scene_refs)

    # 4. Which TPDB/StashDB scenes never made it into the local DB?
    local_tpdb_ids = {x for refs in scene_refs.values() for x in refs.get("tpdb") or []}
    local_stashdb_ids = {x for refs in scene_refs.values() for x in refs.get("stashdb") or []}

    # Lists in fetch order (not sets) so the printed sample is stable across runs.
    tpdb_missing = [eid for eid in tpdb_scenes if eid not in local_tpdb_ids]
    stashdb_missing = [eid for eid in stashdb_scenes if eid not in local_stashdb_ids]
    print("\nMissing from local DB:")
    print(f" TPDB scenes not in DB: {len(tpdb_missing)} / {len(tpdb_scenes)}")
    print(f" StashDB scenes not in DB: {len(stashdb_missing)} / {len(stashdb_scenes)}")

    _print_missing("TPDB", tpdb_missing, tpdb_scenes)
    _print_missing("StashDB", stashdb_missing, stashdb_scenes)

    # Cross-source overlap: TPDB ∩ StashDB by date + fuzzy title.
    cross_pairs = _match_cross_source(tpdb_scenes, stashdb_scenes)
    print(
        f"\nTPDB ∩ StashDB (same date + title fuzzy ≥{_TITLE_MATCH_THRESHOLD}): "
        f"{len(cross_pairs)} par"
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # The performer name is the single required positional argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("usage: python -m scripts.compare_performer_canon \"<performer name>\"")
        sys.exit(1)
    main(cli_args[0])
|