fix(xvideos): parse model name from nested span.name — recover 0-performer scenes
xvideos renders the scene's models as `<a href="/models/slug">...<span class="name"> Display Name</span>...`. The old _MODEL_RE wanted text immediately after the anchor `>` and never matched current markup → browse-scraped scenes landed with 0 performers (bug-report 2026-06-07: "no actors, but Rebecca Johnson is on the page"). New regex captures slug + nested span.name, bounded within the anchor. + backfill script for the ~11.9k existing zero-performer xvideos scenes (54% have a real /models/ link; resolver merges names to canonical by name_normalized). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
edbffc0fa7
commit
7bf1fd6716
2 changed files with 146 additions and 1 deletions
|
|
@ -28,7 +28,16 @@ _SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECA
|
|||
# JSON-LD metadata: group 1 is the raw body of a <script type="application/ld+json"> element
# (either quote style around the type value; DOTALL so the body may span several lines).
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    flags=re.DOTALL | re.IGNORECASE,
)
|
||||
_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})</a>', re.IGNORECASE)
|
||||
# Model anchor markup on xvideos:
#   `<a href="/models/<slug>" ...><span ...></span><span class="name">Display Name</span>...`
# The display name sits in a NESTED <span class="name">, not as the anchor's direct text.
# The previous pattern `>([^<]{2,60})</a>` required text right after `>` and never matched
# the current markup, so every browse-scraped xvideos scene landed with 0 performers
# (bug-report 2026-06-07, "why are there no actors, they are on the page").
# The tempered `(?:(?!</a>).)*?` keeps a match inside one anchor, so it cannot leak into
# the next model's name when this anchor has no span.name.
# Group 1 is the slug, group 2 the display name.
_MODEL_RE = re.compile(
    r'/models/([a-z0-9_-]+)"(?:(?!</a>).)*?<span class="name">\s*([^<]{2,60})</span>',
    flags=re.DOTALL | re.IGNORECASE,
)
|
||||
# Tag links: group 1 is the slug taken from `href="/tags/<slug>"`.
_TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', flags=re.IGNORECASE)
# Title handed to the inline player call: html5player.setVideoTitle('<title>').
_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
# ISO-8601-style duration `PT#H#M#S`; each component is optional
# (groups: hours, minutes, seconds; a missing component yields None).
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", flags=re.IGNORECASE)
|
||||
|
|
|
|||
136
scripts/backfill_xvideos_performers.py
Normal file
136
scripts/backfill_xvideos_performers.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
"""Backfill performerów dla browse-scraped scen xvideos z 0 performerami.
|
||||
|
||||
Kontekst: `_MODEL_RE` w xvideos_browse.py przez pewien czas nie pasował do markupu
|
||||
xvideos (nazwa modela w zagnieżdżonym `<span class="name">`, nie jako tekst anchora)
|
||||
→ część scen wpadła z 0 performerów (bug-report 2026-06-07 "czemu nie ma aktorów,
|
||||
są na stronie: Rebecca Johnson"). Forward-fix poprawił parser; ten skrypt domyka
|
||||
zaległe sceny re-fetchem strony + dowiązaniem `/models/` przez ten sam resolver co
|
||||
ingest (więc nazwy mergują z canonical po name_normalized).
|
||||
|
||||
Sample 2026-06-08: 54% zero-perf scen ma realny /models/ na stronie (~1.4 perf/scenę),
|
||||
46% to amatorskie uploady bez modela (nie ruszamy). 0 fetch-failów (VPS niezablokowany).
|
||||
|
||||
Użycie (w kontenerze worker):
|
||||
python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S]
|
||||
Bez --commit = dry-run (tylko liczy yield). LIMIT pusty/0 = wszystkie zero-perf sceny.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from sqlalchemy import select, text
|
||||
|
||||
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
||||
from app.db import session_scope
|
||||
from app.extractors import browser_get
|
||||
from app.models.scene import ScenePerformer
|
||||
from app.models.source import Source
|
||||
from app.normalize.scenes import normalize_performer
|
||||
from app.resolve.performer_resolver import resolve_performer
|
||||
|
||||
# Only WARNING and above reach the log output; each record is prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.WARNING)
log = logging.getLogger("backfill_xvideos")

# Module-level state shared by all worker threads: the lock that guards the counters
# and a single scraper instance (used for its detail-page parser).
_lock = threading.Lock()
_scraper = XVideosBrowseScraper()
|
||||
|
||||
|
||||
def _args() -> tuple[int, bool, int, float]:
    """Parse ``sys.argv`` into ``(limit, commit, workers, sleep)``.

    Recognised arguments:

    * a bare non-negative integer -- LIMIT (0 = no limit); the last one wins,
    * ``--commit`` -- apply changes instead of doing a dry run,
    * ``--workers N`` -- thread-pool size (default 3),
    * ``--sleep S`` -- pause in seconds (default 0.3).

    The value following ``--workers`` / ``--sleep`` is consumed together with
    its flag, so ``--workers 5`` is never mistaken for ``LIMIT=5``. A flag that
    is the last argument (no value after it) is ignored, and unknown arguments
    are skipped.

    Raises:
        ValueError: if the value after ``--workers`` / ``--sleep`` is not a
            valid int / float.
    """
    limit = 0
    commit = "--commit" in sys.argv
    workers = 3
    sleep = 0.3

    argv = sys.argv[1:]
    i = 0
    while i < len(argv):
        arg = argv[i]
        has_value = i + 1 < len(argv)
        if arg == "--workers" and has_value:
            workers = int(argv[i + 1])
            i += 1  # skip the value so it is not re-read as LIMIT
        elif arg == "--sleep" and has_value:
            sleep = float(argv[i + 1])
            i += 1  # skip the value so it is not re-read as LIMIT
        elif arg.isdigit():
            limit = int(arg)
        i += 1
    return limit, commit, workers, sleep
|
||||
|
||||
|
||||
def _candidates(limit: int) -> list[tuple]:
    """Return ``(scene_id, page_url)`` rows for xvideos scenes without performers.

    Selects scenes that have a ``playback_sources`` row with origin
    ``tube:xvideoscom`` and no row in ``scene_performers``, newest first
    (``created_at DESC``).

    Args:
        limit: maximum number of rows; a falsy value (0) means no LIMIT clause.

    NOTE(review): the JOIN returns one row per matching playback source, so a
    scene with several xvideos sources would be listed more than once --
    confirm whether that can occur.
    """
    base = (
        "SELECT s.id, ps.page_url FROM scenes s "
        "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' "
        "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) "
        "ORDER BY s.created_at DESC"
    )
    # `limit` is typed int (it comes from int() in _args), so interpolating it is safe here.
    suffix = f" LIMIT {limit}" if limit else ""
    statement = text(base + suffix)
    with session_scope() as session:
        rows = session.execute(statement).all()
        return list(rows)
|
||||
|
||||
|
||||
def _source_id() -> "uuid.UUID":  # noqa: F821
    """Look up the id of the ``Source`` row named ``tube-scraper``.

    Uses ``scalar_one()``, which raises unless exactly one row matches.
    """
    lookup = select(Source.id).where(Source.name == "tube-scraper")
    with session_scope() as session:
        return session.execute(lookup).scalar_one()
|
||||
|
||||
|
||||
# Shared run counters, updated by the workers under `_lock`:
#   gain    - scenes that yielded performers (the would-be gain in a dry run)
#   perf    - total number of performers counted for those scenes
#   nomodel - scenes whose page listed no performers
#   fail    - scenes that could not be processed (counted here only, not in `done`)
#   done    - scenes fully processed
_stats = dict.fromkeys(("gain", "perf", "nomodel", "fail", "done"), 0)
|
||||
|
||||
|
||||
def _attach_performers(scene_id, perfs, src_id) -> bool:
    """Link *perfs* to the scene inside one ``session_scope()``.

    Returns False without writing anything when the scene already has at least
    one ``ScenePerformer`` row (someone else linked it in the meantime),
    True otherwise.
    """
    with session_scope() as s:
        already_linked = s.execute(
            select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1)
        ).first()
        if already_linked:
            return False
        for pos, rp in enumerate(perfs):
            perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id)
            # Guard against inserting the same (scene, performer) pair twice.
            exists = s.execute(
                select(ScenePerformer).where(
                    ScenePerformer.scene_id == scene_id,
                    ScenePerformer.performer_id == perf.id,
                )
            ).first()
            if not exists:
                s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos))
    return True


def _process(row, *, commit: bool, src_id, sleep: float) -> None:
    """Re-fetch one scene page, parse its performers and optionally attach them.

    Args:
        row: ``(scene_id, page_url)`` pair.
        commit: when false (dry run) nothing is written; only counters change.
        src_id: source id passed to ``resolve_performer``; used only with ``commit``.
        sleep: seconds to pause after a scene that yielded performers (0 = none).

    Outcomes are recorded in the shared ``_stats`` counters under ``_lock``:

    * ``fail`` -- fetching/parsing the page or writing to the database raised;
      the error is logged as a warning instead of being lost in the worker's
      Future.
    * ``nomodel`` + ``done`` -- the page listed no performers.
    * ``gain`` + ``perf`` + ``done`` -- performers were found (dry run) or
      actually attached (commit).
    * ``done`` only -- commit mode, but the scene already had performers at
      re-check time, so nothing was written and no gain is claimed.
    """
    scene_id, page_url = row
    try:
        html = browser_get(page_url, timeout=20).text
        rs = _scraper._parse_detail(page_url, html)
    except Exception as exc:  # worker boundary: log and count, never kill the run
        log.warning("scene %s: fetch/parse failed for %s: %r", scene_id, page_url, exc)
        with _lock:
            _stats["fail"] += 1
        return

    perfs = rs.performers if rs else []
    if not perfs:
        # NOTE(review): as in the original flow, no sleep happens on this path
        # even though a page was fetched -- confirm this is intended.
        with _lock:
            _stats["nomodel"] += 1
            _stats["done"] += 1
        return

    attached = True  # a dry run reports the would-be gain
    if commit:
        try:
            attached = _attach_performers(scene_id, perfs, src_id)
        except Exception as exc:  # worker boundary: log and count, never kill the run
            log.warning("scene %s: attaching performers failed: %r", scene_id, exc)
            with _lock:
                _stats["fail"] += 1
            return

    with _lock:
        if attached:
            _stats["gain"] += 1
            _stats["perf"] += len(perfs)
        _stats["done"] += 1
    if sleep:
        time.sleep(sleep)
|
||||
|
||||
|
||||
def main() -> None:
    """Run the backfill: collect candidates, process them in a thread pool, report.

    Prints the run parameters, then a progress line roughly every 15 seconds
    while the ``done`` counter is advancing, and a final ``DONE`` summary.
    Exceptions raised inside workers are logged after the pool has drained
    rather than being silently discarded with their futures.
    """
    from concurrent.futures import wait  # local import: only main() needs it

    limit, commit, workers, sleep = _args()
    rows = _candidates(limit)
    # The source id is only needed for writes, so a dry run skips the lookup.
    src_id = _source_id() if commit else None
    print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True)
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futs = [ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows]
        pending = set(futs)
        last = 0
        while pending:
            # Wake up at most every 15 s for a progress line, but return as soon
            # as the last task finishes instead of sleeping out the interval.
            _, pending = wait(pending, timeout=15)
            d = _stats["done"]
            if d != last:
                print(f" progress done={d}/{len(rows)} gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
                last = d

    # Surface anything a worker raised; otherwise it would vanish with the Future.
    crashed = 0
    for fut in futs:
        exc = fut.exception()
        if exc is not None:
            crashed += 1
            log.error("worker raised: %r", exc)

    print(f"DONE gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
    if crashed:
        print(f"worker exceptions={crashed} (see log)", flush=True)


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue