"""Backfill performerów dla browse-scraped scen xvideos z 0 performerami. Kontekst: `_MODEL_RE` w xvideos_browse.py przez pewien czas nie pasował do markupu xvideos (nazwa modela w zagnieżdżonym ``, nie jako tekst anchora) → część scen wpadła z 0 performerów (bug-report 2026-06-07 "czemu nie ma aktorów, są na stronie: Rebecca Johnson"). Forward-fix poprawił parser; ten skrypt domyka zaległe sceny re-fetchem strony + dowiązaniem `/models/` przez ten sam resolver co ingest (więc nazwy mergują z canonical po name_normalized). Sample 2026-06-08: 54% zero-perf scen ma realny /models/ na stronie (~1.4 perf/scenę), 46% to amatorskie uploady bez modela (nie ruszamy). 0 fetch-failów (VPS niezablokowany). Użycie (w kontenerze worker): python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S] Bez --commit = dry-run (tylko liczy yield). LIMIT pusty/0 = wszystkie zero-perf sceny. """ from __future__ import annotations import logging import sys import threading import time from concurrent.futures import ThreadPoolExecutor from sqlalchemy import select, text from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper from app.db import session_scope from app.extractors import browser_get from app.models.scene import ScenePerformer from app.models.source import Source from app.normalize.scenes import normalize_performer from app.resolve.performer_resolver import resolve_performer logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s") log = logging.getLogger("backfill_xvideos") _scraper = XVideosBrowseScraper() _lock = threading.Lock() def _args() -> tuple[int, bool, int, float]: limit = 0 commit = "--commit" in sys.argv workers = 3 sleep = 0.3 argv = sys.argv[1:] skip = False for i, a in enumerate(argv): if skip: # ta pozycja to wartość poprzedniej flagi (--workers/--sleep) — nie traktuj jako LIMIT skip = False continue if a == "--workers" and i + 1 < len(argv): workers = int(argv[i + 1]); skip = True elif a == "--sleep" and i + 1 < len(argv): sleep = float(argv[i + 1]); skip = True elif a == "--commit": continue elif a.isdigit(): limit = int(a) return limit, commit, workers, sleep def _candidates(limit: int) -> list[tuple]: sql = ( "SELECT s.id, ps.page_url FROM scenes s " "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' " "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) " "ORDER BY s.created_at DESC" ) if limit: sql += f" LIMIT {limit}" with session_scope() as s: return list(s.execute(text(sql)).all()) def _source_id() -> "uuid.UUID": # noqa: F821 with session_scope() as s: return s.execute(select(Source.id).where(Source.name == "tube-scraper")).scalar_one() _stats = {"gain": 0, "perf": 0, "nomodel": 0, "fail": 0, "done": 0} def _process(row, *, commit: bool, src_id, sleep: float) -> None: scene_id, page_url = row try: html = browser_get(page_url, timeout=20).text except Exception: with _lock: _stats["fail"] += 1 return rs = _scraper._parse_detail(page_url, html) perfs = rs.performers if rs else [] if not perfs: with _lock: _stats["nomodel"] += 1; _stats["done"] += 1 return if commit: with session_scope() as s: # re-check: ktoś mógł w międzyczasie dowiązać still_zero = not s.execute( select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1) ).first() if still_zero: for pos, rp in enumerate(perfs): perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id) exists = s.execute( select(ScenePerformer).where( ScenePerformer.scene_id == scene_id, ScenePerformer.performer_id == perf.id, ) ).first() if not exists: s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos)) with _lock: _stats["gain"] += 1; _stats["perf"] += len(perfs); _stats["done"] += 1 if sleep: time.sleep(sleep) def main() -> None: limit, commit, workers, sleep = _args() rows = _candidates(limit) src_id = _source_id() if commit else None print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True) with ThreadPoolExecutor(max_workers=workers) as ex: futs = [ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows] last = 0 for _ in futs: pass # progress poll while any(not f.done() for f in futs): time.sleep(15) d = _stats["done"] if d != last: print(f" progress done={d}/{len(rows)} gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True) last = d print(f"DONE gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True) if __name__ == "__main__": main()