fix(xvideos): parse model name from nested span.name — recover 0-performer scenes

xvideos renders the scene's models as `<a href="/models/slug">...<span class="name">
Display Name</span>...`. The old _MODEL_RE wanted text immediately after the anchor
`>` and never matched current markup → browse-scraped scenes landed with 0 performers
(bug-report 2026-06-07: "no actors, but Rebecca Johnson is on the page"). New regex
captures slug + nested span.name, bounded within the anchor. + backfill script for the
~11.9k existing zero-performer xvideos scenes (54% have a real /models/ link; resolver
merges names to canonical by name_normalized).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-08 10:13:21 +02:00
parent edbffc0fa7
commit 7bf1fd6716
2 changed files with 146 additions and 1 deletions

View file

@ -28,7 +28,16 @@ _SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECA
# Matches a `<script type="application/ld+json">` element and captures its contents
# (DOTALL so the captured payload may span several lines).
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
)
# Previous pattern (the line this diff replaces): it required the display name to be
# plain text directly after the anchor's `>`; superseded by the definition below.
_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})</a>', re.IGNORECASE)
# Model anchor: `<a href="/models/<slug>" ...><span ...></span><span class="name">Display Name</span>...`.
# The name sits in a NESTED <span class="name">, not as the anchor's direct text —
# the previous pattern `>([^<]{2,60})</a>` required text right after `>` and NEVER matched
# the current xvideos markup → every browse-scraped xvideos scene landed with 0 performers
# (bug report 2026-06-07 "why are there no actors, they are on the page"). `(?:(?!</a>).)*?`
# keeps the match inside a single anchor (it cannot leak into the next model's name when
# span.name is missing).
# Group 1 = model slug, group 2 = display name (2-60 characters, no `<`).
_MODEL_RE = re.compile(
    r'/models/([a-z0-9_-]+)"(?:(?!</a>).)*?<span class="name">\s*([^<]{2,60})</span>',
    re.IGNORECASE | re.DOTALL,
)
# Captures the slug of every `href="/tags/<slug>"` link.
_TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE)
# Captures the single-quoted argument of an inline `html5player.setVideoTitle('...')` call.
_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
# Duration of the form `PT<h>H<m>M<s>S`; each of the three numeric groups is optional.
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)

View file

@ -0,0 +1,136 @@
"""Backfill performers for browse-scraped xvideos scenes that have 0 performers.

Context: for a while `_MODEL_RE` in xvideos_browse.py did not match the xvideos
markup (the model name sits in a nested `<span class="name">`, not as the anchor's
direct text), so some scenes were ingested with 0 performers (bug report 2026-06-07:
"why are there no actors, the page shows: Rebecca Johnson"). The forward fix corrected
the parser; this script closes out the backlog by re-fetching each scene page and
linking its `/models/` entries through the same resolver as ingest (so names merge
into the canonical performer by name_normalized).

Sample 2026-06-08: 54% of zero-performer scenes have a real /models/ link on the page
(~1.4 performers/scene); 46% are amateur uploads without a model (left untouched).
0 fetch failures (the VPS is not blocked).

Usage (inside the worker container):
    python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S]

Without --commit = dry run (only counts the yield). LIMIT empty/0 = all zero-performer scenes.
"""
from __future__ import annotations
import logging
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import select, text
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
from app.db import session_scope
from app.extractors import browser_get
from app.models.scene import ScenePerformer
from app.models.source import Source
from app.normalize.scenes import normalize_performer
from app.resolve.performer_resolver import resolve_performer
# Only WARNING and above is emitted; each record is prefixed with a timestamp.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s")
log = logging.getLogger("backfill_xvideos")
# One scraper instance shared by all worker threads; this script only calls its
# _parse_detail() on already-fetched HTML.
_scraper = XVideosBrowseScraper()
# Guards updates to the shared _stats counters made from the worker threads.
_lock = threading.Lock()
def _args() -> tuple[int, bool, int, float]:
    """Parse the command line into ``(limit, commit, workers, sleep)``.

    Recognised arguments (any order):
        LIMIT        bare non-negative integer; 0 / absent means "no limit"
        --commit     write to the database (otherwise dry-run)
        --workers N  thread-pool size (default 3)
        --sleep S    per-scene pause in seconds (default 0.3)

    The value following ``--workers`` / ``--sleep`` is consumed together with
    its flag, so ``--workers 8`` is never mistaken for ``LIMIT=8``.

    Raises:
        ValueError: if the value after ``--workers`` / ``--sleep`` is not numeric.
    """
    argv = sys.argv[1:]
    limit = 0
    commit = "--commit" in sys.argv
    workers = 3
    sleep = 0.3

    pos = 0
    while pos < len(argv):
        arg = argv[pos]
        has_value = pos + 1 < len(argv)
        if arg == "--workers" and has_value:
            workers = int(argv[pos + 1])
            pos += 2  # skip the value so it is not re-read as LIMIT
            continue
        if arg == "--sleep" and has_value:
            sleep = float(argv[pos + 1])
            pos += 2  # skip the value so it is not re-read as LIMIT
            continue
        if arg.isdigit():
            limit = int(arg)
        pos += 1
    return limit, commit, workers, sleep
def _candidates(limit: int) -> list[tuple]:
    """Return ``(scene_id, page_url)`` rows for xvideos scenes without performers.

    Selects scenes that have a ``tube:xvideoscom`` playback source and no row in
    ``scene_performers``, newest first. A truthy ``limit`` caps the number of
    rows; ``0`` returns all of them.
    """
    fragments = [
        "SELECT s.id, ps.page_url FROM scenes s ",
        "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' ",
        "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) ",
        "ORDER BY s.created_at DESC",
    ]
    if limit:
        fragments.append(f" LIMIT {limit}")
    query = "".join(fragments)
    with session_scope() as session:
        rows = session.execute(text(query)).all()
        return list(rows)
def _source_id() -> "uuid.UUID":  # noqa: F821
    """Look up the id of the ``Source`` row named ``"tube-scraper"``.

    Uses ``scalar_one()``, so the lookup is expected to match exactly one row.
    """
    stmt = select(Source.id).where(Source.name == "tube-scraper")
    with session_scope() as session:
        result = session.execute(stmt)
        return result.scalar_one()
# Run counters shared by the worker threads (updated under _lock, printed by main):
#   gain    - scenes whose page yielded at least one performer
#   perf    - total performers found across those scenes
#   nomodel - scenes whose page yielded no performers
#   fail    - scenes whose processing errored
#   done    - scenes that finished processing
_stats = {"gain": 0, "perf": 0, "nomodel": 0, "fail": 0, "done": 0}
def _link_performers(scene_id, perfs, src_id) -> None:
    """Attach ``perfs`` to ``scene_id`` unless the scene has gained performers meanwhile."""
    with session_scope() as s:
        # Re-check: somebody may have linked performers since the candidate query ran.
        still_zero = not s.execute(
            select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1)
        ).first()
        if not still_zero:
            return
        # Two scraped names can resolve to the same canonical performer; remember
        # what this call already handled so the link is added at most once even
        # if the pending row is not yet visible to the SELECT below.
        handled = set()
        for pos, rp in enumerate(perfs):
            perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id)
            if perf.id in handled:
                continue
            handled.add(perf.id)
            exists = s.execute(
                select(ScenePerformer).where(
                    ScenePerformer.scene_id == scene_id,
                    ScenePerformer.performer_id == perf.id,
                )
            ).first()
            if not exists:
                s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos))


def _process(row, *, commit: bool, src_id, sleep: float) -> None:
    """Re-fetch one scene page, parse its performers and optionally link them.

    Args:
        row: ``(scene_id, page_url)`` pair as produced by ``_candidates``.
        commit: when false nothing is written; the scene is only counted.
        src_id: source id passed to ``resolve_performer`` (unused in dry-run).
        sleep: seconds to pause after the scene; a falsy value disables the pause.

    Exactly one of the ``gain`` / ``nomodel`` / ``fail`` counters in ``_stats``
    is incremented per call and ``done`` is always incremented, so progress
    adds up to the number of candidates. Any error (fetch, parse or database)
    is logged and counted as ``fail`` instead of being lost inside the future.
    The pause applies to every outcome so each request to the site is throttled.
    """
    scene_id, page_url = row
    outcome = "fail"
    found = 0
    try:
        html = browser_get(page_url, timeout=20).text
        rs = _scraper._parse_detail(page_url, html)
        perfs = rs.performers if rs else []
        if not perfs:
            outcome = "nomodel"
        else:
            if commit:
                _link_performers(scene_id, perfs, src_id)
            # Set only after a successful write, so a DB error stays a "fail".
            outcome = "gain"
            found = len(perfs)
    except Exception:
        # Worker boundary: one bad scene must not abort the run, but it has to
        # be visible in the log and in the counters.
        log.warning("scene %s failed (%s)", scene_id, page_url, exc_info=True)
    with _lock:
        _stats[outcome] += 1
        _stats["perf"] += found
        _stats["done"] += 1
    if sleep:
        time.sleep(sleep)
def main() -> None:
    """Run the backfill: pick candidates, process them on a thread pool, report.

    Prints a header line, a progress line at most every 15 seconds (only when
    the ``done`` counter moved) and a final ``DONE`` summary. An exception that
    escapes a worker is logged rather than silently discarded.
    """
    # Local import: the file-level import only brings in ThreadPoolExecutor.
    from concurrent.futures import wait

    limit, commit, workers, sleep = _args()
    rows = _candidates(limit)
    src_id = _source_id() if commit else None
    print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True)
    with ThreadPoolExecutor(max_workers=workers) as ex:
        pending = {ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows}
        last = 0
        while pending:
            # Returns after 15 s or as soon as everything still pending has
            # finished, so the script does not idle once the work is done.
            finished, pending = wait(pending, timeout=15)
            for fut in finished:
                err = fut.exception()
                if err is not None:
                    log.error("worker raised: %r", err)
            d = _stats["done"]
            if d != last:
                print(f" progress done={d}/{len(rows)} gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
                last = d
    print(f"DONE gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()