"""Backfill duration_sec dla playback_sources gdzie NULL i origin w setupie. Uzasadnienie: 2026-05-17 zmieniliśmy `duration_extract.py` (dodanie `itemprop` + `PnDTnHnMnS` ISO format). Sceny ingested wcześniej mają `duration_sec=NULL` mimo że detail page faktycznie ma duration. Ten skrypt re-fetchuje i updatuje. Bezpieczny do uruchomienia wielokrotnie — `WHERE duration_sec IS NULL` filter sprawia że już zbackfillowane są pomijane. Uruchomienie: `python -m scripts.backfill_durations --origin tube:xvideoscom --limit 500` Albo wszystkie problematyczne: `--origins tube:xvideoscom,tube:xnxxcom,...` """ from __future__ import annotations import argparse import logging import time from collections.abc import Sequence from sqlalchemy import select, update from app.db import session_scope from app.extractors._fetch import browser_get from app.extractors.duration_extract import extract_duration_sec from app.models.playback_source import PlaybackSource log = logging.getLogger(__name__) def backfill(origins: Sequence[str], limit: int, throttle_sec: float) -> dict: counters = {"checked": 0, "updated": 0, "extract_fail": 0, "fetch_fail": 0} with session_scope() as session: rows = session.execute( select(PlaybackSource.id, PlaybackSource.origin, PlaybackSource.page_url) .where(PlaybackSource.origin.in_(origins)) .where(PlaybackSource.duration_sec.is_(None)) .where(PlaybackSource.dead_at.is_(None)) .limit(limit) ).all() total = len(rows) log.info("backfill start: origins=%s rows=%d", list(origins), total) for i, (pb_id, origin, url) in enumerate(rows, 1): counters["checked"] += 1 try: r = browser_get(url, timeout=10.0, follow_redirects=True) except Exception as e: counters["fetch_fail"] += 1 log.debug("fetch fail %s: %s", url, e) time.sleep(throttle_sec) continue if r.status_code >= 400: counters["fetch_fail"] += 1 time.sleep(throttle_sec) continue dur = extract_duration_sec(r.text) if dur is None: counters["extract_fail"] += 1 else: session.execute( update(PlaybackSource) .where(PlaybackSource.id == pb_id) .values(duration_sec=dur) ) counters["updated"] += 1 if i % 25 == 0: session.commit() log.info("progress %d/%d updated=%d", i, total, counters["updated"]) time.sleep(throttle_sec) session.commit() return counters def main() -> int: parser = argparse.ArgumentParser() parser.add_argument( "--origins", default="tube:xvideoscom,tube:xnxxcom,tube:pornhatcom,tube:sxylandcom,tube:0dayxxcom", help="CSV originów do backfillu", ) parser.add_argument("--limit", type=int, default=2000, help="Max rows per run") parser.add_argument("--throttle", type=float, default=0.35, help="Sekundy między requestami") parser.add_argument("--verbose", action="store_true") args = parser.parse_args() logging.basicConfig( level=logging.DEBUG if args.verbose else logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) origins = [o.strip() for o in args.origins.split(",") if o.strip()] counters = backfill(origins, args.limit, args.throttle) log.info("backfill done: %s", counters) return 0 if __name__ == "__main__": raise SystemExit(main())