goon/scripts/tpdb_backfill.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

191 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""TPDB full sync via date-iteration backfill.
**Why:** the TPDB `/scenes` endpoint caps `meta.total=10000` and `last_page=100` on
the global sort order. Without a date filter we only get the newest 10k scenes (of
which we already have 54k → 0 new). Trick: `?date=YYYY-MM-DD` filters to scenes
RELEASED on that day — per-day pagination has no cap. Iterating dates back to 2010
(pass `--start 2010-01-01`; the default start is 2020-01-01) captures the full catalogue.
**ETA:**
- Date range: configurable (default 2020-01-01 → today, ~5.5 years × 365 = ~2000 days)
- Per-day API: ~5-20 pages × 1.7s = 8-34s
- Total: ~10-20h API time + ingest overhead, plus DB write
- Per-day delay: 0.5s to avoid hammering the API
**Resumability:** progress is saved to `/tmp/tpdb_backfill_progress.txt` (one
date per line). A re-run automatically skips dates that were already processed.
**Usage on VPS:**
    docker compose exec -d worker python scripts/tpdb_backfill.py \
        --start 2020-01-01 --end 2026-05-12
    # monitor:
    docker compose logs -f worker | grep tpdb_backfill
"""
from __future__ import annotations
import argparse
import logging
import sys
import time
from datetime import UTC, date, datetime, timedelta
import httpx
from app.config import get_settings
from app.connectors.tpdb import TPDBConnector, _parse_scene
from app.db import session_scope
from app.ingest import _process_scene, get_or_create_source
from app.models.ingest_run import IngestRun, IngestStatus
from app.models.source import SourceKind
# Root logging is configured at import time so every record emitted by this
# script carries a timestamp, level and logger name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

# Dedicated logger for this script; its name is what the usage example in the
# module docstring greps for in the worker logs.
log = logging.getLogger("tpdb_backfill")

# Checkpoint file used for resuming: one completed day per line.
PROGRESS_FILE = "/tmp/tpdb_backfill_progress.txt"
def _load_done_dates() -> set[str]:
try:
with open(PROGRESS_FILE) as f:
return {line.strip() for line in f if line.strip()}
except FileNotFoundError:
return set()
def _mark_done(d: str) -> None:
with open(PROGRESS_FILE, "a") as f:
f.write(d + "\n")
def _ingest_date(
    connector: TPDBConnector,
    source_id,
    day: str,
    *,
    max_fetch_retries: int = 5,
    retry_backoff: float = 30.0,
) -> dict[str, int]:
    """Ingest a single day: every page of ``/scenes?date=<day>``.

    Pages are requested 100 scenes at a time until the API returns an empty
    ``data`` list or the current page reaches ``meta.last_page``.

    Args:
        connector: TPDB connector providing ``_client()`` and ``_get()``.
        source_id: Source identifier forwarded to ``_process_scene``.
        day: Value sent as the ``date`` query parameter.
        max_fetch_retries: How many times in a row a single page may be
            retried after a retryable HTTP error before giving up.
        retry_backoff: Seconds to sleep before each such retry.

    Returns:
        Counters with the keys ``seen``, ``new``, ``updated``, ``skipped``
        and ``errors``. This function increments only ``seen`` and
        ``errors`` itself; the same dict is handed to ``_process_scene``.

    Raises:
        httpx.HTTPStatusError: If a page fails with a status that is not
            worth retrying (anything other than 429 or 5xx), or keeps
            failing after ``max_fetch_retries`` retries. Counters gathered
            so far for the day are not returned in that case.
    """
    counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    page = 1
    consecutive_failures = 0
    with connector._client() as client:
        while True:
            try:
                payload = connector._get(
                    client, "/scenes", {"per_page": 100, "page": page, "date": day}
                )
            except httpx.HTTPStatusError as e:
                # NOTE(review): the connector reportedly retries internally
                # (tenacity) before this surfaces — confirm in TPDBConnector.
                status = e.response.status_code
                retryable = status == 429 or status >= 500
                consecutive_failures += 1
                # Retrying forever would hang the whole backfill on one page
                # when the error is permanent (bad token, rejected date, ...),
                # so give up and let the caller record the day as failed.
                if not retryable or consecutive_failures > max_fetch_retries:
                    log.error(
                        "tpdb fetch %s page=%d failed: %s — giving up after %d attempt(s)",
                        day, page, e, consecutive_failures,
                    )
                    raise
                log.warning(
                    "tpdb fetch %s page=%d failed: %s — backoff %.0fs (retry %d/%d)",
                    day, page, e, retry_backoff, consecutive_failures, max_fetch_retries,
                )
                time.sleep(retry_backoff)
                continue
            # The retry budget is per page, not per day.
            consecutive_failures = 0

            data = payload.get("data") or []
            if not data:
                break
            for raw in data:
                scene = _parse_scene(raw)
                if scene is None:
                    continue
                counters["seen"] += 1
                try:
                    # A failure in one scene is counted and logged here so it
                    # does not abort the rest of the day.
                    # NOTE(review): the original comment says _process_scene
                    # opens its own session_scope per scene — confirm in
                    # app.ingest.
                    _process_scene(
                        source_id=source_id, raw_scene=scene, counters=counters,
                    )
                except Exception as e:
                    counters["errors"] += 1
                    log.exception("ingest scene failed %s: %s", scene.external_id, e)

            meta = payload.get("meta") or {}
            # A missing last_page is treated as "this was the last page".
            last_page = meta.get("last_page") or page
            if page >= last_page:
                break
            page += 1
    return counters
def main() -> int:
    """Run the date-by-date TPDB backfill and record it as an ingest run.

    Parses ``--start``/``--end``/``--reverse``/``--delay``, creates an
    ``IngestRun`` in ``running`` state, ingests every day of the range that
    is not yet listed in the progress file, and finally stores the totals
    and final status on the run.

    A day is appended to the progress file only after it was ingested
    without raising, so failed days are attempted again on the next run.

    Returns:
        ``0`` when every attempted day succeeded and the loop ran to the
        end; ``1`` when at least one day failed or the run was interrupted
        with Ctrl-C. Invalid arguments exit through argparse (status 2).
    """
    ap = argparse.ArgumentParser()
    # type=date.fromisoformat lets argparse reject malformed dates with a
    # usage error instead of a traceback; string defaults go through it too.
    ap.add_argument("--start", type=date.fromisoformat, default="2020-01-01", help="Start date YYYY-MM-DD")
    ap.add_argument("--end", type=date.fromisoformat, default=date.today().isoformat(), help="End date YYYY-MM-DD")
    ap.add_argument("--reverse", action="store_true", help="Iterate newest → oldest (default: oldest → newest)")
    ap.add_argument("--delay", type=float, default=0.5, help="Sleep between days (seconds)")
    args = ap.parse_args()

    start = args.start
    end = args.end
    if end < start:
        # An inverted range would otherwise iterate zero days and still be
        # recorded as a successful run.
        ap.error(f"--end ({end}) must not be earlier than --start ({start})")
    n_days = (end - start).days + 1
    log.info("backfill range: %s → %s (%d days)", start, end, n_days)

    done = _load_done_dates()
    log.info("already processed: %d days (resume mode)", len(done))

    connector = TPDBConnector()
    with session_scope() as session:
        source = get_or_create_source(session, kind=SourceKind.tpdb, name="tpdb")
        run = IngestRun(source_id=source.id, status=IngestStatus.running)
        session.add(run)
        session.flush()
        # Copy the ids while the session is still open; only these plain
        # values are used after the scope ends.
        source_id = source.id
        run_id = run.id
    log.info("ingest_run %s started", run_id)

    total_counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    days_processed = 0
    failed_days: list[str] = []
    interrupted = False
    current_day = None

    dates = [start + timedelta(days=i) for i in range(n_days)]
    if args.reverse:
        dates.reverse()

    # monotonic() is unaffected by wall-clock adjustments during a run that
    # can last many hours.
    t_start = time.monotonic()
    try:
        for i, d in enumerate(dates):
            day_str = d.isoformat()
            if day_str in done:
                continue
            current_day = day_str
            try:
                t0 = time.monotonic()
                counters = _ingest_date(connector, source_id, day_str)
                elapsed = time.monotonic() - t0
                for k in total_counters:
                    total_counters[k] += counters[k]
                days_processed += 1
                log.info(
                    "day=%s [%d/%d] %s elapsed=%.1fs total_new=%d total_seen=%d",
                    day_str, i + 1, n_days, counters, elapsed,
                    total_counters["new"], total_counters["seen"],
                )
                _mark_done(day_str)
                time.sleep(args.delay)
            except Exception as e:
                failed_days.append(day_str)
                log.exception("day=%s FAILED: %s", day_str, e)
                time.sleep(5)
    except KeyboardInterrupt:
        # Caught around the whole loop so that Ctrl-C during any sleep,
        # including the one after a failed day, still reaches the
        # finalization below instead of leaving the run in "running".
        interrupted = True
        log.warning("interrupted at day=%s — progress saved, resume with same args", current_day)

    overall_elapsed = time.monotonic() - t_start
    log.info(
        "backfill done in %.0fmin. days_processed=%d, failed=%d, totals=%s",
        overall_elapsed / 60, days_processed, len(failed_days), total_counters,
    )

    # An interrupted run did not cover the requested range, so it is
    # reported as partial just like a run with failed days.
    incomplete = interrupted or bool(failed_days)
    with session_scope() as session:
        run = session.get(IngestRun, run_id)
        if run is not None:
            run.finished_at = datetime.now(UTC)
            run.status = IngestStatus.partial if incomplete else IngestStatus.success
            run.records_seen = total_counters["seen"]
            run.records_new = total_counters["new"]
            run.records_updated = total_counters["updated"]
            if failed_days:
                # Store at most 50 days to keep the serialized payload small.
                run.errors = {"failed_days": failed_days[:50]}
    return 1 if incomplete else 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)