From 7bf1fd6716e7c40deae77e79280fda0e39fa19dc Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Mon, 8 Jun 2026 10:13:21 +0200 Subject: [PATCH] =?UTF-8?q?fix(xvideos):=20parse=20model=20name=20from=20n?= =?UTF-8?q?ested=20span.name=20=E2=80=94=20recover=200-performer=20scenes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xvideos renders the scene's models as `... Display Name...`. The old _MODEL_RE wanted text immediately after the anchor `>` and never matched current markup → browse-scraped scenes landed with 0 performers (bug-report 2026-06-07: "no actors, but Rebecca Johnson is on the page"). New regex captures slug + nested span.name, bounded within the anchor. + backfill script for the ~11.9k existing zero-performer xvideos scenes (54% have a real /models/ link; resolver merges names to canonical by name_normalized). Co-Authored-By: Claude Opus 4.8 --- .../direct_scrapers/xvideos_browse.py | 11 +- scripts/backfill_xvideos_performers.py | 136 ++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 scripts/backfill_xvideos_performers.py diff --git a/app/connectors/direct_scrapers/xvideos_browse.py b/app/connectors/direct_scrapers/xvideos_browse.py index 4e6dd51..9d18bad 100644 --- a/app/connectors/direct_scrapers/xvideos_browse.py +++ b/app/connectors/direct_scrapers/xvideos_browse.py @@ -28,7 +28,16 @@ _SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECA _JSONLD_RE = re.compile( r']+type=["\']application/ld\+json["\'][^>]*>(.*?)', re.IGNORECASE | re.DOTALL ) -_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})', re.IGNORECASE) +# Model anchor: `Display Name...`. 
+# The name sits in a NESTED `span.name` element, not as the anchor's direct text —
+# the previous pattern `>([^<]{2,60})` required text right after `>` and NEVER matched
+# the current xvideos markup → every browse-scraped xvideos scene landed with 0 performers
+# (bug-report 2026-06-07 "why are there no actors, they are on the page"). `(?:(?!).)*?` keeps
+# the match inside one anchor (no leak into the next model's name when span.name is missing).
+_MODEL_RE = re.compile(
+    r'/models/([a-z0-9_-]+)"(?:(?!).)*?\s*([^<]{2,60})',
+    re.IGNORECASE | re.DOTALL,
+)
 _TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE)
 _SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
 _ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
diff --git a/scripts/backfill_xvideos_performers.py b/scripts/backfill_xvideos_performers.py
new file mode 100644
index 0000000..a467328
--- /dev/null
+++ b/scripts/backfill_xvideos_performers.py
@@ -0,0 +1,136 @@
+"""Backfill performers for browse-scraped xvideos scenes that have 0 performers.
+
+Context: for a while `_MODEL_RE` in xvideos_browse.py did not match the xvideos
+markup (the model name is in a nested `span.name`, not the anchor's own text)
+→ some scenes were ingested with 0 performers (bug-report 2026-06-07 "why are there
+no actors, they are on the page: Rebecca Johnson"). The forward-fix repaired the parser;
+this script closes the backlog by re-fetching the page and linking `/models/` through the
+same resolver as ingest (so names merge into canonical ones by name_normalized).
+
+Sample 2026-06-08: 54% of zero-perf scenes have a real /models/ on the page (~1.4 perf/scene),
+46% are amateur uploads with no model (left untouched). 0 fetch failures (VPS not blocked).
+
+Usage (inside the worker container):
+    python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S]
+Without --commit = dry-run (only counts the yield). LIMIT empty/0 = all zero-perf scenes.
+"""
+from __future__ import annotations
+
+import logging
+import sys
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, wait
+
+from sqlalchemy import select, text
+
+from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
+from app.db import session_scope
+from app.extractors import browser_get
+from app.models.scene import ScenePerformer
+from app.models.source import Source
+from app.normalize.scenes import normalize_performer
+from app.resolve.performer_resolver import resolve_performer
+
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s")
+log = logging.getLogger("backfill_xvideos")
+
+_scraper = XVideosBrowseScraper()
+# Guards _stats, which is updated from the worker threads.
+_lock = threading.Lock()
+
+
+def _args() -> tuple[int, bool, int, float]:
+    """Parse ``[LIMIT] [--commit] [--workers N] [--sleep S]`` from ``sys.argv``.
+
+    Returns ``(limit, commit, workers, sleep)``. Defaults are ``0`` (no limit),
+    ``False`` (dry-run), ``3`` and ``0.3``. Unknown arguments are ignored, and a
+    trailing ``--workers``/``--sleep`` with no value keeps its default.
+
+    Raises:
+        ValueError: if the value after ``--workers``/``--sleep`` is not a number.
+    """
+    limit = 0
+    commit = False
+    workers = 3
+    sleep = 0.3
+    argv = iter(sys.argv[1:])
+    for arg in argv:
+        if arg == "--commit":
+            commit = True
+        elif arg in ("--workers", "--sleep"):
+            # Consume the option's value here so that a bare number such as the
+            # "5" in "--workers 5" is never mistaken for the positional LIMIT.
+            value = next(argv, None)
+            if value is None:
+                break
+            if arg == "--workers":
+                workers = int(value)
+            else:
+                sleep = float(value)
+        elif arg.isdigit():
+            limit = int(arg)
+    return limit, commit, workers, sleep
+
+
+def _candidates(limit: int) -> list[tuple]:
+    """Return ``(scene_id, page_url)`` rows for xvideos scenes without performers.
+
+    Selects scenes joined to a ``playback_sources`` row with origin
+    ``tube:xvideoscom`` that have no ``scene_performers`` row, newest first by
+    ``created_at``. A truthy ``limit`` caps the row count; ``0`` returns all.
+    """
+    sql = (
+        "SELECT s.id, ps.page_url FROM scenes s "
+        "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' "
+        "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) "
+        "ORDER BY s.created_at DESC"
+    )
+    if limit:
+        # int() keeps the interpolated value numeric whatever the caller passes.
+        sql += f" LIMIT {int(limit)}"
+    with session_scope() as s:
+        return list(s.execute(text(sql)).all())
+
+
+def _source_id() -> "uuid.UUID":  # noqa: F821
+    """Return the id of the ``Source`` row named ``tube-scraper``.
+
+    ``scalar_one()`` raises if there is not exactly one such row.
+    """
+    with session_scope() as s:
+        return s.execute(select(Source.id).where(Source.name == "tube-scraper")).scalar_one()
+
+
+# Per-run counters: gain = scenes whose page yielded performers, perf = performers
+# found on those pages, nomodel = pages without performers, fail = fetch/processing
+# errors, done = rows finished (any outcome).
+_stats = {"gain": 0, "perf": 0, "nomodel": 0, "fail": 0, "done": 0}
+
+
+def _summary() -> str:
+    """Format the outcome counters shared by the progress and DONE lines."""
+    return (
+        f"gain={_stats['gain']} perf={_stats['perf']} "
+        f"nomodel={_stats['nomodel']} fail={_stats['fail']}"
+    )
+
+
+def _link_performers(scene_id, perfs, src_id) -> None:
+    """Attach ``perfs`` to ``scene_id`` unless the scene already has performers."""
+    with session_scope() as s:
+        # Re-check: someone may have linked performers in the meantime.
+        already_linked = s.execute(
+            select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1)
+        ).first()
+        if already_linked:
+            return
+        for pos, rp in enumerate(perfs):
+            perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id)
+            # Skip a pair that is already linked (presumably two scraped names
+            # can resolve to the same performer — verify against the resolver).
+            exists = s.execute(
+                select(ScenePerformer).where(
+                    ScenePerformer.scene_id == scene_id,
+                    ScenePerformer.performer_id == perf.id,
+                )
+            ).first()
+            if not exists:
+                s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos))
+
+
+def _process(row, *, commit: bool, src_id, sleep: float) -> None:
+    """Re-fetch one scene page, parse its performers and optionally link them.
+
+    Args:
+        row: ``(scene_id, page_url)`` as returned by ``_candidates``.
+        commit: when false (dry-run) nothing is written, only counters change.
+        src_id: source id passed to ``resolve_performer``; unused in dry-run.
+        sleep: seconds to pause after the row; ``0`` disables the pause.
+
+    Every row ends in exactly one of ``gain`` / ``nomodel`` / ``fail`` and always
+    bumps ``done``, so the progress counter can reach the candidate total.
+    Fetch, parse and DB errors are logged and counted as ``fail`` instead of
+    being lost inside an unread future. The pause runs on every path so that
+    pages without a model are throttled too.
+    """
+    scene_id, page_url = row
+    outcome = "fail"
+    found = 0
+    try:
+        try:
+            html = browser_get(page_url, timeout=20).text
+        except Exception as exc:
+            log.warning("fetch failed for %s: %s", page_url, exc)
+            return
+        rs = _scraper._parse_detail(page_url, html)
+        perfs = rs.performers if rs else []
+        if not perfs:
+            outcome = "nomodel"
+            return
+        if commit:
+            _link_performers(scene_id, perfs, src_id)
+        # Counted in dry-run as well: gain/perf measure what the page yields.
+        outcome = "gain"
+        found = len(perfs)
+    except Exception:
+        log.exception("processing failed for scene %s (%s)", scene_id, page_url)
+    finally:
+        with _lock:
+            _stats[outcome] += 1
+            _stats["perf"] += found
+            _stats["done"] += 1
+        if sleep:
+            time.sleep(sleep)
+
+
+def main() -> None:
+    """Run the backfill over all candidates and print progress and totals."""
+    limit, commit, workers, sleep = _args()
+    rows = _candidates(limit)
+    src_id = _source_id() if commit else None
+    print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True)
+    with ThreadPoolExecutor(max_workers=workers) as ex:
+        pending = {
+            ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows
+        }
+        last = 0
+        while pending:
+            # Wake up every 15 s for a progress line, but return as soon as the
+            # last future finishes instead of sleeping out a whole interval.
+            _, pending = wait(pending, timeout=15)
+            d = _stats["done"]
+            if d != last:
+                print(f" progress done={d}/{len(rows)} {_summary()}", flush=True)
+                last = d
+    print(f"DONE {_summary()}", flush=True)
+
+
+if __name__ == "__main__":
+    main()