Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
191 lines
6.8 KiB
Python
191 lines
6.8 KiB
Python
"""TPDB full sync via date-iteration backfill.

**Why:** the TPDB `/scenes` endpoint caps `meta.total=10000` and `last_page=100`
on the global sort. Without a date filter we only ever get the newest 10k scenes
(and since we already hold 54k, that yields 0 new ones). The trick:
`?date=YYYY-MM-DD` filters to scenes RELEASED on that day, and per-day pagination
has no cap. By iterating dates back to 2010 we capture the full catalogue.

**ETA:**
- Date range: configurable (default 2020-01-01 → today, ~5.5 years × 365 = ~2000 days)
- Per-day API: ~5-20 pages × 1.7s = 8-34s
- Total: ~10-20h API time + ingest overhead, plus DB write
- Per-day delay: 0.5s so that we do not hammer the API

**Resumability:** progress is written to `/tmp/tpdb_backfill_progress.txt` (one
date per line). A re-run automatically skips the dates already processed.

**Usage on VPS:**
    docker compose exec -d worker python scripts/tpdb_backfill.py \
        --start 2020-01-01 --end 2026-05-12

    # monitor:
    docker compose logs -f worker | grep tpdb_backfill
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import logging
|
||
import sys
|
||
import time
|
||
from datetime import UTC, date, datetime, timedelta
|
||
|
||
import httpx
|
||
|
||
from app.config import get_settings
|
||
from app.connectors.tpdb import TPDBConnector, _parse_scene
|
||
from app.db import session_scope
|
||
from app.ingest import _process_scene, get_or_create_source
|
||
from app.models.ingest_run import IngestRun, IngestStatus
|
||
from app.models.source import SourceKind
|
||
|
||
# Resume checkpoint: plain text file holding one ISO date per line.
PROGRESS_FILE = "/tmp/tpdb_backfill_progress.txt"

# Configure root logging at INFO; the format includes the logger name so the
# "tpdb_backfill" lines can be filtered out of the worker's log stream.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("tpdb_backfill")
|
||
|
||
|
||
def _load_done_dates() -> set[str]:
    """Return the entries already recorded in the progress file.

    Every non-blank line of ``PROGRESS_FILE`` becomes one entry, with
    surrounding whitespace stripped. If the file does not exist, an empty
    set is returned.
    """
    recorded: set[str] = set()
    try:
        with open(PROGRESS_FILE) as progress:
            for raw_line in progress:
                entry = raw_line.strip()
                if entry:
                    recorded.add(entry)
    except FileNotFoundError:
        # First run: nothing has been checkpointed yet.
        return set()
    return recorded
|
||
|
||
|
||
def _mark_done(d: str) -> None:
    """Append ``d`` as its own line at the end of the progress file."""
    entry = d + "\n"
    with open(PROGRESS_FILE, mode="a") as progress:
        progress.write(entry)
|
||
|
||
|
||
def _ingest_date(
    connector: TPDBConnector,
    source_id,
    day: str,
    *,
    max_retries: int = 5,
    backoff_s: float = 30.0,
) -> dict[str, int]:
    """Ingest a single day: every page of ``/scenes?date=<day>``.

    Pages are requested 100 scenes at a time, starting at page 1, until a page
    comes back empty or ``meta.last_page`` is reached. Each raw item is parsed
    with ``_parse_scene`` and handed to ``_process_scene``; a failure in one
    scene is logged and counted but does not stop the rest of the day.

    Args:
        connector: TPDB connector providing ``_client()`` and ``_get()``.
        source_id: Identifier of the TPDB source row, forwarded to
            ``_process_scene``.
        day: Release date to fetch, as ``YYYY-MM-DD``.
        max_retries: Maximum number of consecutive retries for one page after
            a retryable HTTP error (429 or 5xx).
        backoff_s: Seconds to sleep before each retry.

    Returns:
        Counters with the keys ``seen``, ``new``, ``updated``, ``skipped`` and
        ``errors``. ``seen`` and ``errors`` are incremented here; the dict is
        also passed to ``_process_scene``, which presumably maintains the
        remaining keys.

    Raises:
        httpx.HTTPStatusError: On a non-retryable status (4xx other than 429),
            or once ``max_retries`` consecutive retries of the same page have
            been used up. Previously such errors were retried forever, so a
            permanent failure (bad credentials, rejected parameter) blocked
            the whole backfill on one page.
    """
    counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    page = 1
    failures = 0  # consecutive failed fetches of the current page
    with connector._client() as client:
        while True:
            try:
                payload = connector._get(
                    client, "/scenes", {"per_page": 100, "page": page, "date": day}
                )
            except httpx.HTTPStatusError as e:
                # Per the original note, the connector has already retried
                # internally (tenacity, 5x) before this error surfaces.
                failures += 1
                status = e.response.status_code
                retryable = status == 429 or status >= 500
                if not retryable or failures > max_retries:
                    log.error(
                        "tpdb fetch %s page=%d giving up after %d attempt(s): %s",
                        day, page, failures, e,
                    )
                    # Let the caller record this day as failed instead of
                    # looping on the same page indefinitely.
                    raise
                log.warning(
                    "tpdb fetch %s page=%d failed: %s — backoff %.0fs (retry %d/%d)",
                    day, page, e, backoff_s, failures, max_retries,
                )
                time.sleep(backoff_s)
                continue
            failures = 0  # the page was fetched; reset the retry budget

            data = payload.get("data") or []
            if not data:
                break

            for raw in data:
                scene = _parse_scene(raw)
                if scene is None:
                    continue
                counters["seen"] += 1
                try:
                    # _process_scene opens its own session_scope internally;
                    # a per-scene transaction means that one failing scene
                    # does not kill the whole day.
                    _process_scene(
                        source_id=source_id, raw_scene=scene, counters=counters,
                    )
                except Exception as e:
                    counters["errors"] += 1
                    log.exception("ingest scene failed %s: %s", scene.external_id, e)

            meta = payload.get("meta") or {}
            # Without a usable last_page, treat the current page as the last.
            last_page = meta.get("last_page") or page
            if page >= last_page:
                break
            page += 1

    return counters
|
||
|
||
|
||
def main() -> int:
    """Run the date-by-date TPDB backfill from the command line.

    Parses ``--start``, ``--end``, ``--reverse`` and ``--delay``, opens an
    ``IngestRun`` row, then calls ``_ingest_date`` for every day in the range
    that is not yet listed in the progress file. A day is written to the
    progress file only after it has been ingested without raising, so a
    re-run with the same arguments resumes where the previous one stopped.

    When the loop ends, the ``IngestRun`` row is finalised:

    * ``success`` only if no day failed AND the loop was not interrupted;
    * ``partial`` otherwise (a Ctrl-C used to be recorded as ``success``
      whenever no day had failed, although the range was not finished).

    Returns:
        Process exit code: 0 when no day failed, 1 otherwise. An interrupt
        alone does not change the exit code.
    """
    ap = argparse.ArgumentParser()
    # type=date.fromisoformat makes argparse reject malformed dates with a
    # usage error instead of a traceback; string defaults are parsed as well.
    ap.add_argument(
        "--start", type=date.fromisoformat, default="2020-01-01",
        help="Start date YYYY-MM-DD",
    )
    ap.add_argument(
        "--end", type=date.fromisoformat, default=date.today().isoformat(),
        help="End date YYYY-MM-DD",
    )
    ap.add_argument("--reverse", action="store_true", help="Iterate newest → oldest (default: oldest → newest)")
    ap.add_argument("--delay", type=float, default=0.5, help="Sleep between days (seconds)")
    args = ap.parse_args()

    start = args.start
    end = args.end
    if end < start:
        # An inverted range would otherwise produce a negative day count and
        # record an empty run as successful.
        ap.error(f"--end ({end}) is earlier than --start ({start})")
    n_days = (end - start).days + 1
    log.info("backfill range: %s → %s (%d days)", start, end, n_days)

    done = _load_done_dates()
    log.info("already processed: %d days (resume mode)", len(done))

    connector = TPDBConnector()
    with session_scope() as session:
        source = get_or_create_source(session, kind=SourceKind.tpdb, name="tpdb")
        run = IngestRun(source_id=source.id, status=IngestStatus.running)
        session.add(run)
        session.flush()
        # Copy the ids out while the session is still open.
        source_id = source.id
        run_id = run.id

    log.info("ingest_run %s started", run_id)

    total_counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    days_processed = 0
    failed_days = []
    interrupted_at = None  # day being processed when Ctrl-C arrived, if any

    dates = [start + timedelta(days=i) for i in range(n_days)]
    if args.reverse:
        dates.reverse()

    t_start = time.time()
    for i, d in enumerate(dates):
        day_str = d.isoformat()
        if day_str in done:
            continue
        try:
            t0 = time.time()
            counters = _ingest_date(connector, source_id, day_str)
            elapsed = time.time() - t0
            for k in total_counters:
                total_counters[k] += counters[k]
            days_processed += 1
            log.info(
                "day=%s [%d/%d] %s elapsed=%.1fs total_new=%d total_seen=%d",
                day_str, i + 1, n_days, counters, elapsed,
                total_counters["new"], total_counters["seen"],
            )
            # Checkpoint only after the whole day went through.
            _mark_done(day_str)
            time.sleep(args.delay)
        except KeyboardInterrupt:
            log.warning("interrupted at day=%s — progress saved, resume with same args", day_str)
            interrupted_at = day_str
            break
        except Exception as e:
            # The day stays out of the progress file, so a re-run retries it.
            failed_days.append(day_str)
            log.exception("day=%s FAILED: %s", day_str, e)
            time.sleep(5)

    overall_elapsed = time.time() - t_start
    log.info(
        "backfill done in %.0fmin. days_processed=%d, failed=%d, totals=%s",
        overall_elapsed / 60, days_processed, len(failed_days), total_counters,
    )

    completed = not failed_days and interrupted_at is None
    with session_scope() as session:
        run = session.get(IngestRun, run_id)
        if run is not None:
            run.finished_at = datetime.now(UTC)
            run.status = IngestStatus.success if completed else IngestStatus.partial
            run.records_seen = total_counters["seen"]
            run.records_new = total_counters["new"]
            run.records_updated = total_counters["updated"]
            errors = {}
            if failed_days:
                errors["failed_days"] = failed_days[:50]  # cap the list before it is serialized
            if interrupted_at is not None:
                errors["interrupted_at"] = interrupted_at
            if errors:
                run.errors = errors

    return 0 if not failed_days else 1
|
||
|
||
|
||
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|