goon/scripts/fill_tpdb_refs_batch.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

113 lines
4.1 KiB
Python

"""Batch-fill TPDB external refs for the top-N performers that have a StashDB ref but no TPDB ref.

Performs a live lookup of the TPDB UUID by name and INSERTs it into PerformerExternalRef.
When finished, prints a CSV list of names, ready to be passed to
`worker --once --strategy=performer-driven --performers='...'`.
"""
from __future__ import annotations
import sys
from sqlalchemy import case, func, select
from app.connectors.tpdb import TPDBConnector
from app.db import session_scope
from app.models.performer import Performer, PerformerExternalRef
from app.models.scene import ScenePerformer
from app.models.source import Source, SourceKind
def main(top_n: int = 30) -> None:
    """Link the top-N StashDB-only performers to TPDB via a live name lookup.

    Selects up to ``top_n`` performers that have a StashDB external ref but no
    TPDB one, ordered by scene count (descending). For each candidate the TPDB
    id is looked up by canonical name and a ``PerformerExternalRef`` is added,
    unless that TPDB id is already mapped. Progress and a summary are printed
    to stdout, followed by a comma-joined list of the linked names.

    Args:
        top_n: Maximum number of candidate performers to process.
    """
    with session_scope() as session:
        tpdb_src_id = session.execute(
            select(Source.id).where(Source.kind == SourceKind.tpdb)
        ).scalar_one()
        stashdb_src_id = session.execute(
            select(Source.id).where(Source.kind == SourceKind.stashdb)
        ).scalar_one()

        # Performer ids that already carry a ref for the given source.
        has_tpdb = (
            select(PerformerExternalRef.performer_id)
            .where(PerformerExternalRef.source_id == tpdb_src_id)
            .distinct()
        ).subquery()
        has_stashdb = (
            select(PerformerExternalRef.performer_id)
            .where(PerformerExternalRef.source_id == stashdb_src_id)
            .distinct()
        ).subquery()

        # Anti-join on has_tpdb (outer join + IS NULL) keeps only performers
        # without a TPDB ref; the inner join on has_stashdb requires a
        # StashDB ref. Performers with no scenes are kept with a count of 0.
        candidates = session.execute(
            select(
                Performer.id,
                Performer.canonical_name,
                func.count(ScenePerformer.scene_id).label("c"),
            )
            .outerjoin(ScenePerformer, ScenePerformer.performer_id == Performer.id)
            .outerjoin(has_tpdb, has_tpdb.c.performer_id == Performer.id)
            .join(has_stashdb, has_stashdb.c.performer_id == Performer.id)
            .where(has_tpdb.c.performer_id.is_(None))
            .group_by(Performer.id, Performer.canonical_name)
            .order_by(func.count(ScenePerformer.scene_id).desc())
            .limit(top_n)
        ).all()

    print(f"=== {len(candidates)} candidates (have stashdb, missing tpdb) ===")
    for _, name, count in candidates:
        print(f" {count:5d} {name}")

    print("\n=== Live lookup TPDB ===")
    tpdb = TPDBConnector()
    matched: list[str] = []
    not_found: list[str] = []
    errors: list[str] = []
    for pid, name, _ in candidates:
        try:
            tpdb_id = tpdb.find_performer_id_by_name(name)
        except Exception as e:
            # Best-effort per performer: a failed lookup must not abort the
            # batch, but it is also not evidence that the performer is absent
            # from TPDB, so it is tracked separately from not_found.
            print(f" ERR {name}: {e}")
            errors.append(name)
            continue
        if not tpdb_id:
            not_found.append(name)
            print(f" -- {name} (not found in TPDB)")
            continue

        # Insert the ref in its own session scope, one per performer.
        # NOTE(review): presumably session_scope commits on exit — confirm.
        with session_scope() as session:
            existing = session.execute(
                select(PerformerExternalRef).where(
                    PerformerExternalRef.source_id == tpdb_src_id,
                    PerformerExternalRef.external_id == tpdb_id,
                )
            ).scalar_one_or_none()
            if existing:
                if existing.performer_id != pid:
                    # Conflict: the TPDB UUID is already mapped to a different
                    # local performer. Leave it alone — whether to merge the
                    # performers is a manual decision.
                    print(
                        f" CONFLICT {name}: tpdb={tpdb_id} already mapped to "
                        f"performer_id={existing.performer_id}"
                    )
                else:
                    print(f" ok (already linked) {name}: tpdb={tpdb_id}")
                    matched.append(name)
                continue
            session.add(
                PerformerExternalRef(
                    source_id=tpdb_src_id,
                    external_id=tpdb_id,
                    performer_id=pid,
                    confidence=0.9,
                )
            )
            print(f" + {name}: tpdb={tpdb_id}")
            matched.append(name)

    print("\n=== Done ===")
    print(f"linked: {len(matched)}")
    print(f"not_found in tpdb: {len(not_found)}")
    if errors:
        # Only shown when something failed, so a clean run prints the same
        # summary as before.
        print(f"lookup errors: {len(errors)}")
    if matched:
        print("\nNames CSV (paste to --performers):")
        print(",".join(matched))
if __name__ == "__main__":
    # An optional first command-line argument overrides the default of 30
    # candidates; it must parse as an integer.
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    else:
        n = 30
    main(n)