goon/scripts/tpdb_studio_backfill.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

72 lines
2.7 KiB
Python

"""TPDB studio scenes batch ingest dla top-orphan studios w naszej DB.
Bierze studios które:
- Mają ≥1 freshporno scene jako orphan (bez TPDB/StashDB canonical ref)
- Mają TPDB external_id (site ID)
Per każde studio: fetch wszystkich scen z `/sites/<id>/scenes` TPDB endpoint,
przepuszcza przez nasz standard resolver. Po backfill — phash/title fuzzy match
może podpiąć orphany do nowo zaimportowanych canonical scenes (Path 3 resolver).
"""
from __future__ import annotations
from app.connectors.tpdb import TPDBConnector
from app.db import session_scope
from app.models.source import SourceKind
from app.scheduler.performer_driven import _ingest_iter_into_run
# We use psycopg-style %s placeholders (not SQLAlchemy text() with :colon
# binds) — the `:` in the value 'tube:freshpornoorg' breaks the SA bind parser
# even with explicit bindparams.
#
# Positional parameters, in order:
#   1.   playback_sources.origin to match (only rows with dead_at IS NULL)
#   2-3. source names; a scene with an external ref to either one is not
#        counted as an orphan
#   4.   source name whose studio external_id is returned
# Result rows: (studio name, studio external_id), ordered by studio name.
QUERY = """
WITH fp_orphans AS (
SELECT DISTINCT sc.studio_id FROM scenes sc
JOIN playback_sources ps ON ps.scene_id = sc.id
WHERE ps.origin = %s AND ps.dead_at IS NULL
AND sc.studio_id IS NOT NULL
AND NOT EXISTS (
SELECT 1 FROM scene_external_refs er JOIN sources s ON s.id=er.source_id
WHERE er.scene_id=sc.id AND s.name IN (%s, %s)
)
)
SELECT st.name, ser.external_id
FROM fp_orphans fo
JOIN studios st ON st.id = fo.studio_id
JOIN studio_external_refs ser ON ser.studio_id = st.id
JOIN sources s ON s.id = ser.source_id AND s.name = %s
ORDER BY st.name;
"""
def main() -> None:
    """Backfill TPDB scenes for every studio row returned by ``QUERY``.

    Executes ``QUERY`` through the raw driver, then calls
    ``_ingest_iter_into_run`` once per ``(studio name, TPDB site id)`` row,
    handing it a factory that fetches that site's scenes from the TPDB
    connector. Per-studio and total ``seen``/``new`` counters are printed to
    stdout. A failure for one studio is printed and the loop moves on to the
    next studio.
    """
    connector = TPDBConnector()
    query_params = ("tube:freshpornoorg", "tpdb", "stashdb", "tpdb")

    with session_scope() as sess:
        result = sess.connection().exec_driver_sql(QUERY, query_params)
        studios = list(result)

    # Resuming is idempotent — per the original author's note,
    # _ingest_iter_into_run dedups by external_id, so already-processed
    # studios are skipped quickly (seen ~= 0, new = 0).
    studio_count = len(studios)
    print(f"FOUND {studio_count} top-orphan studios with TPDB site IDs", flush=True)

    total_seen = 0
    total_new = 0
    for position, row in enumerate(studios, start=1):
        name, site_id = row
        print(f"[{position}/{studio_count}] {name} (site={site_id}) ...", flush=True)

        # Bind the current site id as a default so the factory does not pick
        # up a later iteration's value.
        def scenes_factory(sid=site_id):
            return connector.fetch_scenes_for_site(sid)

        try:
            counters = _ingest_iter_into_run(
                source_kind=SourceKind.tpdb,
                source_name="tpdb",
                run_label=f"tpdb-studio-backfill:{name}",
                iterator_factory=scenes_factory,
            )
            seen = counters.get("seen", 0)
            new = counters.get("new", 0)
            total_seen += seen
            total_new += new
            print(f" seen={seen} new={new}", flush=True)
        except Exception as exc:
            # Best-effort: report the failure (message truncated to 200
            # characters) and continue with the next studio.
            print(f" ERR: {type(exc).__name__}: {str(exc)[:200]}", flush=True)

    print(f"\nDONE total_seen={total_seen} total_new={total_new}", flush=True)


if __name__ == "__main__":
    main()