"""xvideos.com — deep-crawl browse scraper (JSON-LD + page-parse). xvideos SSR-uje JSON-LD VideoObject (duration, name, uploadDate) ORAZ na detail-stronie linki `/models/` (performerzy tej sceny) + `/tags/` (tagi). Sample 2026-06-03 (15 scen): median ~10.5min, 93% ≥3min — dobry full-scene content (nie trailery). Mega-katalog (~13M) → deep_crawl z per-tube page-cap (xvideoscom w deep_crawl._PAGE_CAP), żeby nie monopolizował round-robin ani nie zalał bazy. Listing: /new/ (newest). Scene: /video./. Playback: page_url + origin tube:xvideoscom (istniejący extractor `xvideoscom` resolvuje stream mobile-side). Phash pominięty (xvideos robi własne crop-thumbnaile — 0% hit do canonical, jak fullmovies/hdporn). """ from __future__ import annotations import json import logging import re from datetime import date, datetime from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content from app.normalize.text import slugify log = logging.getLogger(__name__) _BASE = "https://www.xvideos.com" _SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECASE) _JSONLD_RE = re.compile( r']+type=["\']application/ld\+json["\'][^>]*>(.*?)', re.IGNORECASE | re.DOTALL ) _MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})', re.IGNORECASE) _TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE) _SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)") _ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE) def _dur_to_sec(value: str | None) -> int | None: if not value: return None m = _ISO_DUR_RE.match(str(value).strip()) if not m: return None total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0) return total or None def _iso_date(value: str | None) -> date | None: if not value: return None try: return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date() except ValueError: m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value)) return date.fromisoformat(m.group(1)) if m else None def _video_object(html: str) -> dict | None: for m in _JSONLD_RE.finditer(html): raw = m.group(1).strip() if not raw: continue try: data = json.loads(raw) except (json.JSONDecodeError, ValueError): continue items = data if isinstance(data, list) else (data.get("@graph", [data]) if isinstance(data, dict) else []) for obj in items: if isinstance(obj, dict) and obj.get("@type") == "VideoObject": return obj return None class XVideosBrowseScraper(BaseBrowseScraper): sitetag = "xvideoscom" def _listing_url(self, page: int) -> str: return f"{_BASE}/new/{page}" def _extract_scene_urls(self, listing_html: str) -> list[str]: seen: set[str] = set() out: list[str] = [] for m in _SCENE_URL_RE.finditer(listing_html): url = f"{_BASE}{m.group(1)}" if url in seen: continue seen.add(url) out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: video = _video_object(detail_html) or {} title = (video.get("name") or "").strip() if not title: m = _SETTITLE_RE.search(detail_html) title = m.group(1).strip() if m else (meta_content(detail_html, property="og:title") or "").strip() if not title: return None duration_sec = _dur_to_sec(video.get("duration")) release_date = _iso_date(video.get("uploadDate") or video.get("datePublished")) thumbnail_url = video.get("thumbnailUrl") or meta_content(detail_html, property="og:image") if isinstance(thumbnail_url, list): thumbnail_url = thumbnail_url[0] if thumbnail_url else None # Performerzy: linki /models/ (scene-specific; nav xvideos używa innego patternu). performers: list[RawPerformer] = [] seen_perf: set[str] = set() for m in _MODEL_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if not name or slug in seen_perf or name.lower() in ("models", "pornstars"): continue seen_perf.add(slug) performers.append(RawPerformer(external_id=f"{self.sitetag}:model:{slug}", name=name)) if len(performers) >= 8: break # Tagi: /tags/. tags: list[RawTag] = [] seen_tag: set[str] = set() for m in _TAG_RE.finditer(detail_html): slug = m.group(1) if slug in seen_tag or len(slug) > 60: continue seen_tag.add(slug) tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=slug.replace("-", " "), slug=slug)) if len(tags) >= 15: break return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, duration_sec=duration_sec, release_date=release_date, url=scene_url, performers=performers, tags=tags, playback_sources=[ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ], raw={"source": "xvideos_browse"}, )