goon/scripts/backfill_xvideos_performers.py
jtrzupek d4b89f16e3 fix(scripts): backfill arg parser consumed --workers value as LIMIT
'--workers 3' set limit=3 because the bare '3' also hit the isdigit() branch.
Skip flag-value positions when scanning for a positional LIMIT.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 10:15:09 +02:00

143 lines
5.4 KiB
Python

"""Backfill performerów dla browse-scraped scen xvideos z 0 performerami.
Kontekst: `_MODEL_RE` w xvideos_browse.py przez pewien czas nie pasował do markupu
xvideos (nazwa modela w zagnieżdżonym `<span class="name">`, nie jako tekst anchora)
→ część scen wpadła z 0 performerów (bug-report 2026-06-07 "czemu nie ma aktorów,
są na stronie: Rebecca Johnson"). Forward-fix poprawił parser; ten skrypt domyka
zaległe sceny re-fetchem strony + dowiązaniem `/models/` przez ten sam resolver co
ingest (więc nazwy mergują z canonical po name_normalized).
Sample 2026-06-08: 54% zero-perf scen ma realny /models/ na stronie (~1.4 perf/scenę),
46% to amatorskie uploady bez modela (nie ruszamy). 0 fetch-failów (VPS niezablokowany).
Użycie (w kontenerze worker):
python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S]
Bez --commit = dry-run (tylko liczy yield). LIMIT pusty/0 = wszystkie zero-perf sceny.
"""
from __future__ import annotations
import logging
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import select, text
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
from app.db import session_scope
from app.extractors import browser_get
from app.models.scene import ScenePerformer
from app.models.source import Source
from app.normalize.scenes import normalize_performer
from app.resolve.performer_resolver import resolve_performer
# Only WARNING and above is emitted through logging; routine progress output
# in this script goes through print() instead.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s")
log = logging.getLogger("backfill_xvideos")
# One scraper instance shared by every worker thread; this script only calls
# its _parse_detail() on already-fetched HTML.
# NOTE(review): assumes _parse_detail() is safe to call concurrently — confirm.
_scraper = XVideosBrowseScraper()
# Serialises updates to the shared _stats counters made from worker threads.
_lock = threading.Lock()
def _args() -> tuple[int, bool, int, float]:
    """Parse ``sys.argv`` into ``(limit, commit, workers, sleep)``.

    Recognised tokens, all optional and in any order:

    * a bare run of digits -> LIMIT (default 0; the last one seen wins)
    * ``--commit``         -> commit flag is True
    * ``--workers N``      -> worker count, default 3
    * ``--sleep S``        -> sleep interval in seconds, default 0.3

    The token following ``--workers``/``--sleep`` is consumed as that flag's
    value and is never considered as a LIMIT. Unrecognised tokens are ignored,
    as is ``--workers``/``--sleep`` given as the very last token. A flag value
    that ``int()``/``float()`` cannot parse raises ``ValueError``.
    """
    tokens = sys.argv[1:]
    total = len(tokens)
    max_scenes, n_workers, pause = 0, 3, 0.3
    want_commit = "--commit" in sys.argv

    pos = 0
    while pos < total:
        tok = tokens[pos]
        pos += 1  # pos now points at the candidate value of a flag
        value_follows = pos < total
        if tok == "--workers" and value_follows:
            n_workers = int(tokens[pos])
            pos += 1  # step over the value so it is not read as LIMIT
        elif tok == "--sleep" and value_follows:
            pause = float(tokens[pos])
            pos += 1  # step over the value so it is not read as LIMIT
        elif tok != "--commit" and tok.isdigit():
            max_scenes = int(tok)
    return max_scenes, want_commit, n_workers, pause
def _candidates(limit: int) -> list[tuple]:
    """Return ``(scene_id, page_url)`` rows for xvideos scenes with no performers.

    Selects scenes joined to a ``playback_sources`` row whose origin is
    ``tube:xvideoscom`` and for which no ``scene_performers`` row exists,
    ordered newest first by ``created_at``.

    Args:
        limit: Maximum number of rows to return; a falsy value (0) means no cap.

    Returns:
        The result rows as a list, each unpackable as ``(scene_id, page_url)``.
    """
    # NOTE(review): the cap applies to joined rows; if a scene can have more
    # than one matching playback_sources row it is returned once per row —
    # confirm against the schema.
    sql = (
        "SELECT s.id, ps.page_url FROM scenes s "
        "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' "
        "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) "
        "ORDER BY s.created_at DESC"
    )
    params: dict[str, int] = {}
    if limit:
        # Bind the cap instead of formatting it into the SQL text, so the
        # statement never depends on how the caller produced ``limit``.
        sql += " LIMIT :limit"
        params["limit"] = limit
    with session_scope() as s:
        return list(s.execute(text(sql), params or None).all())
def _source_id() -> "uuid.UUID":  # noqa: F821
    """Look up the id of the ``Source`` row named ``tube-scraper``.

    Uses ``scalar_one()``, so the lookup raises unless exactly one row matches.
    """
    lookup = select(Source.id).where(Source.name == "tube-scraper")
    with session_scope() as db:
        result = db.execute(lookup)
        return result.scalar_one()
# Shared run counters, updated by the worker threads while holding _lock and
# read by main() for the progress and DONE lines:
#   gain    - scenes counted as yielding performers
#   perf    - performers counted for those scenes
#   nomodel - scenes whose page yielded no performers
#   fail    - scenes that errored
#   done    - scenes counted as finished (shown as done/total in progress)
_stats = {"gain": 0, "perf": 0, "nomodel": 0, "fail": 0, "done": 0}
def _link_performers(scene_id, perfs, src_id) -> int:
    """Attach parsed performers to ``scene_id``; return how many links were added."""
    added = 0
    with session_scope() as s:
        # Re-check inside this session: someone may have linked performers to
        # the scene since the candidate list was built.
        already_linked = s.execute(
            select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1)
        ).first()
        if already_linked:
            return 0
        for pos, rp in enumerate(perfs):
            perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id)
            exists = s.execute(
                select(ScenePerformer).where(
                    ScenePerformer.scene_id == scene_id,
                    ScenePerformer.performer_id == perf.id,
                )
            ).first()
            if not exists:
                s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos))
                added += 1
    return added


def _process(row, *, commit: bool, src_id, sleep: float) -> None:
    """Re-fetch one scene page, parse its performers and record the outcome.

    This runs in a thread-pool worker, where an uncaught exception is stored
    on the Future instead of being printed, so every failure is logged and
    counted here rather than allowed to escape.

    Args:
        row: ``(scene_id, page_url)`` pair.
        commit: When true, link the parsed performers to the scene in the
            database; when false (dry run) only count what would be linked.
        src_id: Source id handed to ``resolve_performer``; only used when
            ``commit`` is true.
        sleep: Seconds to pause after the scene has been handled, whatever
            the outcome; values <= 0 disable the pause.

    Counters (updated under ``_lock``):
        ``done`` is incremented for every scene, including failures.
        ``fail`` counts fetch errors and errors during parsing or the write.
        ``nomodel`` counts pages that yielded no performers.
        ``gain``/``perf`` count scenes and performers found (dry run) or
        actually linked (commit). A commit that adds nothing, because the
        scene was linked in the meantime, only increments ``done``.
    """
    scene_id, page_url = row
    outcome, n_perf = "fail", 0
    try:
        html = browser_get(page_url, timeout=20).text
    except Exception as exc:
        # Best-effort fetch: keep going with the next scene, but leave a trace.
        log.warning("fetch failed scene=%s url=%s: %r", scene_id, page_url, exc)
    else:
        try:
            rs = _scraper._parse_detail(page_url, html)
            perfs = rs.performers if rs else []
            if not perfs:
                outcome = "nomodel"
            else:
                n_perf = _link_performers(scene_id, perfs, src_id) if commit else len(perfs)
                outcome = "gain" if n_perf else "unchanged"
        except Exception:
            log.exception("processing failed scene=%s url=%s", scene_id, page_url)
            outcome, n_perf = "fail", 0

    with _lock:
        _stats["done"] += 1
        if outcome == "gain":
            _stats["gain"] += 1
            _stats["perf"] += n_perf
        elif outcome != "unchanged":
            _stats[outcome] += 1

    # Throttle after every scene, not only after a successful one, so pages
    # without performers and failed fetches are rate-limited as well.
    if sleep and sleep > 0:
        time.sleep(sleep)
def main() -> None:
    """Run the backfill: gather candidates, process them in a pool, report.

    Reads the options from the command line, loads the candidate scenes,
    resolves the source id only when committing, then submits one
    ``_process`` task per scene to a thread pool. While tasks are running, a
    progress line is printed at most every 15 seconds, and only when the
    ``done`` counter has changed. A final ``DONE`` line prints the counters.
    Any exception that a task left on its future is logged and summarised,
    so failures are never silently dropped.
    """
    # Local import: the module's top-level import only brings in ThreadPoolExecutor.
    from concurrent.futures import wait

    limit, commit, workers, sleep = _args()
    rows = _candidates(limit)
    src_id = _source_id() if commit else None
    print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True)
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futs = [ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows]
        last = 0
        pending = set(futs)
        while pending:
            # Wakes up as soon as everything has finished, or after 15 s to
            # report progress — no fixed 15 s tail once the work is done.
            _, pending = wait(pending, timeout=15)
            d = _stats["done"]
            if d != last:
                print(f" progress done={d}/{len(rows)} gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
                last = d

    # All futures are finished here; surface anything a task raised.
    crashed = 0
    for fut in futs:
        exc = fut.exception()
        if exc is not None:
            crashed += 1
            log.error("worker task raised", exc_info=exc)
    if crashed:
        print(f"WARNING {crashed} task(s) raised an unhandled exception; see log", flush=True)
    print(f"DONE gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()