xvideos SSR's JSON-LD VideoObject (duration/title/uploadDate) + on-page /models/ (perf) + /tags/. Sample: median ~10.5min, 93% >=3min. Pilot (2 pages): 29 new, 100% playable + visible + tagged (performers sparse — xvideos 'new' is amateur-heavy; /models/ tagged mostly on studio rips). - XVideosBrowseScraper (JSON-LD + page-parse models/tags), in ALL_BROWSE_SCRAPERS. - deep_crawl._PAGE_CAP: per-sitetag depth cap; xvideoscom=1800 (~newest 50k). At the cap the tube is marked exhausted (reset -> incremental re-sweep) so a mega-tube cannot monopolize the round-robin or balloon the DB. - ported yesporn.py into the public repo (was prod-only, like hdporngg) ending the __init__ public/prod divergence. youporn rejected: JSON-LD lacks actor/keywords, its /pornstar//category/ links are A-Z nav not scene-specific. xhamster: 429/Cloudflare from the VPS IP. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
147 lines
5.6 KiB
Python
147 lines
5.6 KiB
Python
"""xvideos.com — deep-crawl browse scraper (JSON-LD + page-parse).
|
|
|
|
xvideos server-side renders a JSON-LD VideoObject (duration, name, uploadDate) AND, on the
detail page, `/models/<slug>` links (performers of that scene) + `/tags/<slug>` links (tags).
Sample 2026-06-03 (15 scenes): median ~10.5min, 93% ≥3min — good full-scene content (not trailers).

Mega-catalogue (~13M) → deep_crawl with a per-tube page cap (xvideoscom in deep_crawl._PAGE_CAP),
so that it neither monopolizes the round-robin nor floods the database. Listing: /new/<page> (newest).
Scene: /video.<hash>/<slug>. Playback: page_url + origin tube:xvideoscom (the existing
`xvideoscom` extractor resolves the stream mobile-side). Phash is skipped (xvideos generates its
own cropped thumbnails — 0% hit rate against canonical, like fullmovies/hdporn).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import date, datetime
|
|
|
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag
|
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
|
|
from app.normalize.text import slugify
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_BASE = "https://www.xvideos.com"
|
|
_SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECASE)
|
|
_JSONLD_RE = re.compile(
|
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
|
|
)
|
|
_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})</a>', re.IGNORECASE)
|
|
_TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE)
|
|
_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
|
|
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
|
|
|
|
|
|
def _dur_to_sec(value: str | None) -> int | None:
|
|
if not value:
|
|
return None
|
|
m = _ISO_DUR_RE.match(str(value).strip())
|
|
if not m:
|
|
return None
|
|
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
|
return total or None
|
|
|
|
|
|
def _iso_date(value: str | None) -> date | None:
|
|
if not value:
|
|
return None
|
|
try:
|
|
return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date()
|
|
except ValueError:
|
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value))
|
|
return date.fromisoformat(m.group(1)) if m else None
|
|
|
|
|
|
def _video_object(html: str) -> dict | None:
    """Return the first JSON-LD node typed ``VideoObject`` found in *html*.

    Every ``application/ld+json`` script body matched by ``_JSONLD_RE`` is
    decoded in document order; bodies that are empty or not valid JSON are
    skipped. A decoded document may be a list of nodes, a dict wrapping its
    nodes in ``@graph``, or a single node.

    Args:
        html: Raw page markup.

    Returns:
        The matching node as a dict, or ``None`` when no script contains one.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

        if isinstance(data, list):
            nodes = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            if isinstance(graph, list):
                nodes = graph
            elif isinstance(graph, dict):
                # A single node given directly instead of a one-element list.
                nodes = [graph]
            else:
                # No usable @graph (absent, null, scalar): the document is the node.
                nodes = [data]
        else:
            continue

        for node in nodes:
            if not isinstance(node, dict):
                continue
            node_type = node.get("@type")
            # JSON-LD permits @type to be a single string or an array of strings.
            declared = node_type if isinstance(node_type, list) else [node_type]
            if "VideoObject" in declared:
                return node
    return None
|
|
|
|
|
|
class XVideosBrowseScraper(BaseBrowseScraper):
    """Browse scraper for xvideos.com: walks ``/new/<page>`` listings and parses scene pages."""

    sitetag = "xvideoscom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page *page* under ``/new/``."""
        return "{}/new/{}".format(_BASE, page)

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from *listing_html*, without repeats, in first-seen order."""
        absolute = (f"{_BASE}{path}" for path in _SCENE_URL_RE.findall(listing_html))
        # dict keys keep insertion order, so this drops repeats while preserving order.
        return list(dict.fromkeys(absolute))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Title, duration, date and thumbnail come from the page's JSON-LD
        VideoObject when present, with page-level fallbacks for the title and
        thumbnail. Performers and tags are read from ``/models/`` and ``/tags/``
        links in the markup.

        Returns:
            The parsed scene, or ``None`` when no title can be determined.
        """
        ld = _video_object(detail_html) or {}

        # Title preference: JSON-LD name, then the player's setVideoTitle(...) call, then og:title.
        title = (ld.get("name") or "").strip()
        if not title:
            player_title = _SETTITLE_RE.search(detail_html)
            if player_title:
                title = player_title.group(1).strip()
            else:
                title = (meta_content(detail_html, property="og:title") or "").strip()
        if not title:
            return None

        length_sec = _dur_to_sec(ld.get("duration"))
        published = _iso_date(ld.get("uploadDate") or ld.get("datePublished"))

        thumb = ld.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
        if isinstance(thumb, list):
            # When a list is given, only its first entry is used.
            thumb = thumb[0] if thumb else None

        # Performers: /models/<slug> links (scene-specific; the xvideos navigation
        # uses a different pattern). Keyed by slug to skip repeats; capped at 8.
        ignored_labels = ("models", "pornstars")
        cast: dict[str, RawPerformer] = {}
        for hit in _MODEL_RE.finditer(detail_html):
            model_slug, label = hit.groups()
            label = label.strip()
            if label and model_slug not in cast and label.lower() not in ignored_labels:
                cast[model_slug] = RawPerformer(
                    external_id=f"{self.sitetag}:model:{model_slug}",
                    name=label,
                )
                if len(cast) >= 8:
                    break

        # Tags: /tags/<slug> links. Slugs over 60 characters are ignored; capped at 15.
        labels: dict[str, RawTag] = {}
        for hit in _TAG_RE.finditer(detail_html):
            tag_slug = hit.group(1)
            if len(tag_slug) <= 60 and tag_slug not in labels:
                labels[tag_slug] = RawTag(
                    external_id=f"{self.sitetag}:tag:{tag_slug}",
                    name=tag_slug.replace("-", " "),
                    slug=tag_slug,
                )
                if len(labels) >= 15:
                    break

        # The scene page itself is the playback source.
        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=length_sec,
            thumbnail_url=thumb,
        )
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=length_sec,
            release_date=published,
            url=scene_url,
            performers=list(cast.values()),
            tags=list(labels.values()),
            playback_sources=[playback],
            raw={"source": "xvideos_browse"},
        )
|