xvideos renders the scene's models as `<a href="/models/slug">...<span class="name"> Display Name</span>...`. The old _MODEL_RE wanted text immediately after the anchor `>` and never matched current markup → browse-scraped scenes landed with 0 performers (bug-report 2026-06-07: "no actors, but Rebecca Johnson is on the page"). New regex captures slug + nested span.name, bounded within the anchor. + backfill script for the ~11.9k existing zero-performer xvideos scenes (54% have a real /models/ link; resolver merges names to canonical by name_normalized). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
136 lines
5.1 KiB
Python
136 lines
5.1 KiB
Python
"""Backfill performerów dla browse-scraped scen xvideos z 0 performerami.
|
|
|
|
Context: for a while `_MODEL_RE` in xvideos_browse.py did not match the xvideos
markup (the model name sits in a nested `<span class="name">`, not as the anchor
text), so some scenes were ingested with 0 performers (bug report 2026-06-07:
"why are there no actors, they are on the page: Rebecca Johnson"). The forward
fix corrected the parser; this script closes out the backlog by re-fetching each
page and linking its `/models/` entries through the same resolver as ingest (so
names merge with the canonical performer by name_normalized).

Sample 2026-06-08: 54% of zero-performer scenes have a real /models/ link on the
page (~1.4 performers/scene); 46% are amateur uploads without a model (left
untouched). 0 fetch failures (VPS not blocked).

Usage (inside the worker container):
    python scripts/backfill_xvideos_performers.py [LIMIT] [--commit] [--workers N] [--sleep S]
Without --commit = dry run (only counts the yield). LIMIT empty/0 = all
zero-performer scenes.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import sys
|
|
import threading
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
from sqlalchemy import select, text
|
|
|
|
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
|
from app.db import session_scope
|
|
from app.extractors import browser_get
|
|
from app.models.scene import ScenePerformer
|
|
from app.models.source import Source
|
|
from app.normalize.scenes import normalize_performer
|
|
from app.resolve.performer_resolver import resolve_performer
|
|
|
|
# Root logging is configured at WARNING, so only warnings and errors are emitted;
# each record is prefixed with its timestamp.
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.WARNING)
log = logging.getLogger("backfill_xvideos")

# One scraper instance shared by every worker thread (used for its detail-page parser).
_scraper = XVideosBrowseScraper()
# Guards the module-level run counters, which worker threads update concurrently.
_lock = threading.Lock()
|
|
|
|
|
|
def _args() -> tuple[int, bool, int, float]:
    """Parse the command line into ``(limit, commit, workers, sleep)``.

    Recognised arguments (read from ``sys.argv``):

    * a bare non-negative integer -- LIMIT (0 / absent means "no limit");
    * ``--commit`` -- write to the database instead of doing a dry run;
    * ``--workers N`` -- thread-pool size (default 3);
    * ``--sleep S`` -- per-scene pause in seconds (default 0.3).

    Unknown arguments are ignored. A trailing ``--workers`` / ``--sleep``
    without a value is ignored as well.

    Raises:
        ValueError: if the value after ``--workers`` is not an int or the
            value after ``--sleep`` is not a float.
    """
    limit = 0
    commit = "--commit" in sys.argv
    workers = 3
    sleep = 0.3

    argv = sys.argv[1:]
    idx = 0
    while idx < len(argv):
        arg = argv[idx]
        has_value = idx + 1 < len(argv)
        if arg == "--workers" and has_value:
            workers = int(argv[idx + 1])
            # Skip the consumed value so that e.g. "--workers 5" is not
            # re-read as the positional LIMIT on the next iteration.
            idx += 2
        elif arg == "--sleep" and has_value:
            sleep = float(argv[idx + 1])
            idx += 2
        else:
            if arg.isdigit():
                limit = int(arg)
            idx += 1
    return limit, commit, workers, sleep
|
|
|
|
|
|
def _candidates(limit: int) -> list[tuple]:
    """Return ``(scene id, page_url)`` rows for scenes that have no performers.

    Only scenes joined to a playback source with origin ``tube:xvideoscom``
    and without any ``scene_performers`` row are selected, newest first
    (by ``created_at``). A truthy ``limit`` caps the number of rows; 0 returns
    all of them.
    """
    base_sql = (
        "SELECT s.id, ps.page_url FROM scenes s "
        "JOIN playback_sources ps ON ps.scene_id=s.id AND ps.origin='tube:xvideoscom' "
        "WHERE NOT EXISTS (SELECT 1 FROM scene_performers sp WHERE sp.scene_id=s.id) "
        "ORDER BY s.created_at DESC"
    )
    limit_clause = f" LIMIT {limit}" if limit else ""
    query = text(base_sql + limit_clause)
    with session_scope() as session:
        result = session.execute(query)
        return list(result.all())
|
|
|
|
|
|
def _source_id() -> "uuid.UUID":  # noqa: F821
    """Return the id of the ``Source`` row named ``tube-scraper``.

    Uses ``scalar_one()``, so the lookup raises unless exactly one row matches.
    """
    stmt = select(Source.id).where(Source.name == "tube-scraper")
    with session_scope() as session:
        return session.execute(stmt).scalar_one()
|
|
|
|
|
|
# Run counters shared by all worker threads (updated by _process under _lock):
#   gain    - scenes whose page yielded at least one performer
#   perf    - total number of performers parsed across those scenes
#   nomodel - scenes whose page yielded no performers
#   fail    - scenes whose page fetch raised
#   done    - scenes fully processed (gain + nomodel; failures are not included)
_stats = {"gain": 0, "perf": 0, "nomodel": 0, "fail": 0, "done": 0}
|
|
|
|
|
|
def _attach_performers(scene_id, perfs, src_id) -> None:
    """Link the parsed performers to ``scene_id`` unless it already has some."""
    with session_scope() as s:
        # Re-check in this session: someone may have linked performers since
        # the candidate list was built.
        already_linked = s.execute(
            select(ScenePerformer.scene_id).where(ScenePerformer.scene_id == scene_id).limit(1)
        ).first()
        if already_linked:
            return
        for pos, rp in enumerate(perfs):
            perf = resolve_performer(s, norm=normalize_performer(rp), source_id=src_id)
            # Two raw names may resolve to the same performer; skip the
            # link if that (scene, performer) pair is already present.
            exists = s.execute(
                select(ScenePerformer).where(
                    ScenePerformer.scene_id == scene_id,
                    ScenePerformer.performer_id == perf.id,
                )
            ).first()
            if not exists:
                s.add(ScenePerformer(scene_id=scene_id, performer_id=perf.id, position=pos))


def _process(row, *, commit: bool, src_id, sleep: float) -> None:
    """Re-fetch one scene page, parse its performers and optionally link them.

    Args:
        row: ``(scene_id, page_url)`` pair as returned by ``_candidates``.
        commit: when true, write ``ScenePerformer`` links; otherwise dry run.
        src_id: source id passed to ``resolve_performer`` (unused in dry run).
        sleep: seconds to pause after a scene that yielded performers.

    Never raises for fetch, parse or database errors: they are logged and
    counted in ``_stats["fail"]`` so that they do not vanish inside the
    worker's Future. All counter updates happen under ``_lock``.
    """
    scene_id, page_url = row
    try:
        html = browser_get(page_url, timeout=20).text
    except Exception as exc:
        log.warning("fetch failed scene=%s url=%s: %r", scene_id, page_url, exc)
        with _lock:
            _stats["fail"] += 1
        return

    try:
        rs = _scraper._parse_detail(page_url, html)
        perfs = rs.performers if rs else []
        if perfs and commit:
            _attach_performers(scene_id, perfs, src_id)
    except Exception:
        # Parser or database error: report it instead of letting the
        # exception disappear into a Future nobody inspects.
        log.exception("processing failed scene=%s url=%s", scene_id, page_url)
        with _lock:
            _stats["fail"] += 1
        return

    if not perfs:
        with _lock:
            _stats["nomodel"] += 1
            _stats["done"] += 1
        return

    # NOTE(review): a scene is counted as a gain even when the re-check in
    # _attach_performers found it already linked - confirm this is intended.
    with _lock:
        _stats["gain"] += 1
        _stats["perf"] += len(perfs)
        _stats["done"] += 1
    # NOTE(review): the pause only runs on this path; the nomodel and failure
    # paths return before it - confirm the throttle is meant to skip them.
    if sleep:
        time.sleep(sleep)
|
|
|
|
|
|
def main() -> None:
    """Run the backfill: pick candidates, process them in a thread pool, report.

    Reads options via ``_args``, loads candidate scenes via ``_candidates``
    and submits one ``_process`` call per scene. While workers run, a
    progress line is printed at most every 15 seconds (only when the ``done``
    counter changed); a final ``DONE`` summary is printed at the end.
    Exceptions raised inside workers are logged rather than silently dropped.
    """
    # Function-scope import: the file-level import only brings in
    # ThreadPoolExecutor.
    from concurrent.futures import wait

    limit, commit, workers, sleep = _args()
    rows = _candidates(limit)
    # The source id is only needed when performer links are actually written.
    src_id = _source_id() if commit else None
    print(f"candidates={len(rows)} commit={commit} workers={workers} sleep={sleep}", flush=True)
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futs = [ex.submit(_process, r, commit=commit, src_id=src_id, sleep=sleep) for r in rows]
        pending = set(futs)
        last = 0
        # wait() returns as soon as everything has finished, or after 15 s,
        # so short runs do not sit through a full polling interval.
        while pending:
            _, pending = wait(pending, timeout=15)
            d = _stats["done"]
            if d != last:
                print(f"  progress done={d}/{len(rows)} gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
                last = d

    # Surface any exception that escaped a worker; nothing else reads the
    # futures' results.
    crashed = 0
    for fut in futs:
        exc = fut.exception()
        if exc is not None:
            crashed += 1
            log.error("worker raised: %r", exc)
    if crashed:
        log.error("unhandled worker errors=%d", crashed)

    print(f"DONE gain={_stats['gain']} perf={_stats['perf']} nomodel={_stats['nomodel']} fail={_stats['fail']}", flush=True)
|
|
|
|
|
|
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|