"""TPDB full sync via date-iteration backfill.

**Why:** the TPDB `/scenes` endpoint caps `meta.total=10000` and
`last_page=100` on the global sort. Without a date filter we only get the
newest 10k scenes (of which we already have 54k → 0 new).

Trick: `?date=YYYY-MM-DD` filters to scenes RELEASED on that day — per-day
pagination has no cap. Iterating dates back to 2010 captures the full
catalogue.

**ETA:**
- Date range: configurable (default 2020-01-01 → today,
  ~5.5 years × 365 = ~2000 days)
- Per-day API: ~5-20 pages × 1.7s = 8-34s
- Total: ~10-20h API time + ingest overhead, plus DB write
- Per-day delay: 0.5s so we do not hammer the API

**Resumability:** progress is written to `/tmp/tpdb_backfill_progress.txt`
(one date per line). A re-run automatically skips dates already processed.

**Usage on VPS:**
    docker compose exec -d worker python scripts/tpdb_backfill.py \\
        --start 2020-01-01 --end 2026-05-12
    # monitor:
    docker compose logs -f worker | grep tpdb_backfill
"""

from __future__ import annotations

import argparse
import logging
import sys
import time
from datetime import UTC, date, datetime, timedelta

import httpx

from app.config import get_settings
from app.connectors.tpdb import TPDBConnector, _parse_scene
from app.db import session_scope
from app.ingest import _process_scene, get_or_create_source
from app.models.ingest_run import IngestRun, IngestStatus
from app.models.source import SourceKind

log = logging.getLogger("tpdb_backfill")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

PROGRESS_FILE = "/tmp/tpdb_backfill_progress.txt"

# Upper bound on consecutive HTTP failures for a single page before the day
# is abandoned. Without a cap, a permanent error (401/404/...) would make the
# fetch loop spin forever on the same page.
_MAX_FETCH_ATTEMPTS = 5
# Pause between attempts for a failing page, in seconds.
_FETCH_BACKOFF_SECONDS = 30


def _load_done_dates() -> set[str]:
    """Return the set of ISO dates already recorded in ``PROGRESS_FILE``.

    Blank lines are ignored. A missing progress file means nothing has been
    processed yet and yields an empty set.
    """
    try:
        with open(PROGRESS_FILE, encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()


def _mark_done(d: str) -> None:
    """Append date string *d* as a new line to ``PROGRESS_FILE``."""
    with open(PROGRESS_FILE, "a", encoding="utf-8") as f:
        f.write(d + "\n")


def _ingest_date(connector: TPDBConnector, source_id, day: str) -> dict[str, int]:
    """Ingest a single day: walk every page of ``/scenes?date=<day>``.

    Args:
        connector: TPDB connector used to open the HTTP client and fetch pages.
        source_id: Identifier of the source row, passed through to
            ``_process_scene``.
        day: Release date to fetch, formatted ``YYYY-MM-DD``.

    Returns:
        Counter dict with keys ``seen``, ``new``, ``updated``, ``skipped`` and
        ``errors``. ``seen`` and ``errors`` are maintained here; the dict is
        also handed to ``_process_scene`` (presumably it updates the remaining
        keys — confirm against ``app.ingest``).

    Raises:
        httpx.HTTPStatusError: when the same page fails
            ``_MAX_FETCH_ATTEMPTS`` times in a row.
    """
    counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    page = 1
    consecutive_failures = 0

    with connector._client() as client:
        while True:
            try:
                payload = connector._get(
                    client, "/scenes", {"per_page": 100, "page": page, "date": day}
                )
            except httpx.HTTPStatusError as e:
                # Per the original note: a 429 surfacing here means the
                # connector (tenacity) has already retried on its own. Back
                # off and retry the same page, but only a bounded number of
                # times so a permanent error cannot hang the whole backfill.
                consecutive_failures += 1
                if consecutive_failures >= _MAX_FETCH_ATTEMPTS:
                    log.error(
                        "tpdb fetch %s page=%d giving up after %d attempts",
                        day,
                        page,
                        consecutive_failures,
                    )
                    raise
                log.warning(
                    "tpdb fetch %s page=%d failed: %s — backoff %ds",
                    day,
                    page,
                    e,
                    _FETCH_BACKOFF_SECONDS,
                )
                time.sleep(_FETCH_BACKOFF_SECONDS)
                continue

            # The page was fetched; the failure budget applies per page.
            consecutive_failures = 0

            data = payload.get("data") or []
            if not data:
                break

            for raw in data:
                scene = _parse_scene(raw)
                if scene is None:
                    continue
                counters["seen"] += 1
                try:
                    # Per the original note: _process_scene opens its own
                    # session_scope, so each scene is its own transaction and
                    # one failing scene does not kill the whole day.
                    _process_scene(
                        source_id=source_id,
                        raw_scene=scene,
                        counters=counters,
                    )
                except Exception as e:
                    counters["errors"] += 1
                    log.exception("ingest scene failed %s: %s", scene.external_id, e)

            meta = payload.get("meta") or {}
            # Missing/zero last_page is treated as "this was the last page".
            last_page = meta.get("last_page") or page
            if page >= last_page:
                break
            page += 1

    return counters


def main() -> int:
    """Run the backfill over the requested date range.

    Parses CLI arguments, creates an ``IngestRun`` row, ingests every day in
    the range that is not yet listed in the progress file, then finalises the
    ``IngestRun`` with the aggregated counters.

    Returns:
        Process exit code: ``0`` when no day failed, ``1`` otherwise.
    """
    ap = argparse.ArgumentParser()
    # type= lets argparse report malformed dates as a usage error instead of
    # an unhandled ValueError traceback; string defaults are converted too.
    ap.add_argument(
        "--start",
        type=date.fromisoformat,
        default="2020-01-01",
        help="Start date YYYY-MM-DD",
    )
    ap.add_argument(
        "--end",
        type=date.fromisoformat,
        default=date.today().isoformat(),
        help="End date YYYY-MM-DD",
    )
    ap.add_argument(
        "--reverse",
        action="store_true",
        help="Iterate newest → oldest (default: oldest → newest)",
    )
    ap.add_argument(
        "--delay", type=float, default=0.5, help="Sleep between days (seconds)"
    )
    args = ap.parse_args()

    start: date = args.start
    end: date = args.end
    if start > end:
        # An inverted range would otherwise record an empty "successful" run.
        ap.error(f"--start ({start}) must not be after --end ({end})")

    n_days = (end - start).days + 1
    log.info("backfill range: %s → %s (%d days)", start, end, n_days)

    done = _load_done_dates()
    log.info("already processed: %d days (resume mode)", len(done))

    connector = TPDBConnector()

    with session_scope() as session:
        source = get_or_create_source(session, kind=SourceKind.tpdb, name="tpdb")
        run = IngestRun(source_id=source.id, status=IngestStatus.running)
        session.add(run)
        # Flush so run.id is populated; the ids are copied out for use after
        # this session scope ends.
        session.flush()
        source_id = source.id
        run_id = run.id
    log.info("ingest_run %s started", run_id)

    total_counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    days_processed = 0
    failed_days: list[str] = []
    interrupted = False

    dates = [start + timedelta(days=i) for i in range(n_days)]
    if args.reverse:
        dates.reverse()

    day_str: str | None = None
    t_start = time.time()
    # The KeyboardInterrupt handler wraps the whole loop so that Ctrl-C during
    # the error-path sleep is handled too and the IngestRun is still finalised.
    try:
        for i, d in enumerate(dates):
            day_str = d.isoformat()
            if day_str in done:
                continue
            try:
                t0 = time.time()
                counters = _ingest_date(connector, source_id, day_str)
                elapsed = time.time() - t0
                for k in total_counters:
                    total_counters[k] += counters[k]
                days_processed += 1
                log.info(
                    "day=%s [%d/%d] %s elapsed=%.1fs total_new=%d total_seen=%d",
                    day_str,
                    i + 1,
                    n_days,
                    counters,
                    elapsed,
                    total_counters["new"],
                    total_counters["seen"],
                )
                _mark_done(day_str)
                time.sleep(args.delay)
            except Exception as e:
                # A failed day is not marked done, so a re-run retries it.
                failed_days.append(day_str)
                log.exception("day=%s FAILED: %s", day_str, e)
                time.sleep(5)
    except KeyboardInterrupt:
        interrupted = True
        log.warning(
            "interrupted at day=%s — progress saved, resume with same args",
            day_str,
        )

    overall_elapsed = time.time() - t_start
    log.info(
        "backfill done in %.0fmin. days_processed=%d, failed=%d, totals=%s",
        overall_elapsed / 60,
        days_processed,
        len(failed_days),
        total_counters,
    )

    with session_scope() as session:
        run = session.get(IngestRun, run_id)
        if run is not None:
            run.finished_at = datetime.now(UTC)
            # An interrupted run did not cover the full range, so it must not
            # be recorded as a success even if no individual day failed.
            if failed_days or interrupted:
                run.status = IngestStatus.partial
            else:
                run.status = IngestStatus.success
            run.records_seen = total_counters["seen"]
            run.records_new = total_counters["new"]
            run.records_updated = total_counters["updated"]
            if failed_days:
                # Cap the list so the serialised payload stays bounded.
                run.errors = {"failed_days": failed_days[:50]}

    return 0 if not failed_days else 1


if __name__ == "__main__":
    sys.exit(main())