diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py
index 3c6ce6c..0e9d4e8 100644
--- a/app/connectors/direct_scrapers/__init__.py
+++ b/app/connectors/direct_scrapers/__init__.py
@@ -141,9 +141,11 @@ from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
+from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
+from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
FreshpornoScraper,
@@ -164,6 +166,16 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
# komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio +
# performer + date + duration nadrabiają.
PornDoeScraper,
+ # YesPornVipScraper — dołączony 2026-05-27 (user audit). JSON-LD VideoObject
+ # + `<meta property="video:*">` per scena (Goon ma
+ # duration w sekundach gotowe + ISO 8601 release_date z timezone). Studio +
+ # performerzy z `btn gold` linków (`/channels/<slug>/` + `/models/<slug>/`).
+ # 941k organic monthly (SE Ranking, comparable z porndoe 731k / porntrex 790k).
+ # Scraper-of-paysites (DogFart / HardX / TeamSkeet / Vixen) — wysokie expected
+ # canonical match dla studio scenes. Korekta: theporndude scorecard rank 26
+ # ('yespornvip.com', score -0.5, auth wall) dotyczył **innej domeny** — pdude.link
+ # redirect do porndudecams affiliate. Prawdziwa kanoniczna domena to TLD `.vip`.
+ YesPornVipScraper,
# FullmoviesScraper + HDPornGGScraper — dołączone 2026-06-01. KVS engine (sponsor_groups
# stack, `/videos//` + `/latest-updates/`). Studio teraz z PREFIKSU tytułu
# ("Studio - Scene") — sidebar `/networks/` listował WSZYSTKIE sieci, więc pierwszy match
@@ -177,8 +189,14 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
# publiczne JSON API (api/v2/video/search): 1 call = 100 filmów z title+length_sec+
# keywords+added+thumb. ~100k filmów, deep-crawl przez crawl_page() (API, bez detail-fetch).
EpornerApiScraper,
+ # XVideosBrowseScraper — dołączony 2026-06-03. SSR JSON-LD (duration/title/uploadDate)
+ # + page-parse /models/ (performerzy) + /tags/. Sample: median ~10.5min, 93% ≥3min.
+ # Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
+ # full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
+ XVideosBrowseScraper,
# 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML
# (probe 2026-06-01). Wymagałby headless render — odłożony.
+ # porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
]
diff --git a/app/connectors/direct_scrapers/xvideos_browse.py b/app/connectors/direct_scrapers/xvideos_browse.py
new file mode 100644
index 0000000..4e6dd51
--- /dev/null
+++ b/app/connectors/direct_scrapers/xvideos_browse.py
@@ -0,0 +1,147 @@
+"""xvideos.com — deep-crawl browse scraper (JSON-LD + page-parse).
+
+xvideos SSR-uje JSON-LD VideoObject (duration, name, uploadDate) ORAZ na detail-stronie
+linki `/models/` (performerzy tej sceny) + `/tags/` (tagi). Sample 2026-06-03
+(15 scen): median ~10.5min, 93% ≥3min — dobry full-scene content (nie trailery).
+
+Mega-katalog (~13M) → deep_crawl z per-tube page-cap (xvideoscom w deep_crawl._PAGE_CAP),
+żeby nie monopolizował round-robin ani nie zalał bazy. Listing: /new/ (newest).
+Scene: /video.<id>/<slug>. Playback: page_url + origin tube:xvideoscom (istniejący
+extractor `xvideoscom` resolvuje stream mobile-side). Phash pominięty (xvideos robi
+własne crop-thumbnaile — 0% hit do canonical, jak fullmovies/hdporn).
+"""
+from __future__ import annotations
+
+import json
+import logging
+import re
+from datetime import date, datetime
+
+from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag
+from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
+from app.normalize.text import slugify
+
+log = logging.getLogger(__name__)
+
+_BASE = "https://www.xvideos.com"
+# Scene links on listing pages; group 1 is the site-relative path /video.<id>/<slug>.
+_SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECASE)
+# Body of every <script type="application/ld+json"> element; group 1 is the raw JSON.
+# The pattern must keep exactly one capture group: _video_object() reads group(1).
+_JSONLD_RE = re.compile(
+    r"<script[^>]*\btype=[\"']application/ld\+json[\"'][^>]*>(.*?)</script>",
+    re.IGNORECASE | re.DOTALL,
+)
+# Performer links; group 1 is the model slug, group 2 the link text (2-60 characters).
+_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})', re.IGNORECASE)
+# Tag links; group 1 is the tag slug.
+_TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE)
+# Title passed to the inline player script: html5player.setVideoTitle('...').
+_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
+# ISO-8601 time-only duration (PT#H#M#S); every component is optional.
+_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
+
+
+def _dur_to_sec(value: str | None) -> int | None:
+    """Convert an ISO-8601 duration such as ``PT1H2M3S`` into whole seconds.
+
+    Returns ``None`` for a missing value, for text that does not match
+    ``_ISO_DUR_RE``, and for durations that add up to zero seconds.
+    """
+    if value:
+        parsed = _ISO_DUR_RE.match(str(value).strip())
+        if parsed:
+            # Components absent from the text come back as None and count as 0.
+            hours, minutes, seconds = (int(part or 0) for part in parsed.groups())
+            return (hours * 3600 + minutes * 60 + seconds) or None
+    return None
+
+
+def _iso_date(value: str | None) -> date | None:
+    """Extract the calendar date from an ISO-8601 date or datetime string.
+
+    Any ``Z`` in the text is rewritten to ``+00:00`` before parsing. When the
+    full value cannot be parsed, a leading ``YYYY-MM-DD`` prefix is tried.
+
+    Args:
+        value: Date text such as a JSON-LD ``uploadDate``; may be ``None``.
+
+    Returns:
+        The parsed date, or ``None`` when the value is missing or does not
+        describe a valid calendar date.
+    """
+    if not value:
+        return None
+    text = str(value)
+    try:
+        return datetime.fromisoformat(text.replace("Z", "+00:00")).date()
+    except ValueError:
+        pass
+    prefix = re.match(r"(\d{4}-\d{2}-\d{2})", text)
+    if prefix is None:
+        return None
+    try:
+        return date.fromisoformat(prefix.group(1))
+    except ValueError:
+        # Well-formed but impossible date (e.g. month 13): treat as "no date"
+        # instead of letting the error abort parsing of the whole scene.
+        return None
+
+
+def _video_object(html: str) -> dict | None:
+    """Return the first JSON-LD ``VideoObject`` embedded in *html*, if any.
+
+    Every block captured by ``_JSONLD_RE`` is decoded as JSON; blocks that are
+    empty or not valid JSON are skipped. A block may hold a single object, a
+    list of objects, or an object that wraps its nodes in ``@graph``.
+
+    Args:
+        html: Raw HTML of a page.
+
+    Returns:
+        The first dict whose ``@type`` is (or includes) ``"VideoObject"``, or
+        ``None`` when no such object is found.
+    """
+    for block in _JSONLD_RE.finditer(html):
+        raw = block.group(1).strip()
+        if not raw:
+            continue
+        try:
+            data = json.loads(raw)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        if isinstance(data, dict):
+            # "@graph" may be a list of nodes or a single node object.
+            data = data.get("@graph", data)
+        candidates = data if isinstance(data, list) else [data]
+        for node in candidates:
+            if not isinstance(node, dict):
+                continue
+            node_type = node.get("@type")
+            # JSON-LD allows "@type" to be one string or a list of strings.
+            declared = node_type if isinstance(node_type, list) else [node_type]
+            if "VideoObject" in declared:
+                return node
+    return None
+
+
+class XVideosBrowseScraper(BaseBrowseScraper):
+    """Browse scraper for xvideos.com, walking the ``/new/`` listing.
+
+    Scene metadata is read from the detail page: the JSON-LD ``VideoObject``
+    supplies title, duration, date and thumbnail, while ``/models/`` and
+    ``/tags/`` links supply performers and tags.
+    """
+
+    sitetag = "xvideoscom"
+
+    def _listing_url(self, page: int) -> str:
+        """Return the URL of listing page number *page*."""
+        return f"{_BASE}/new/{page}"
+
+    def _extract_scene_urls(self, listing_html: str) -> list[str]:
+        """Return absolute scene URLs from a listing page, de-duplicated in page order."""
+        absolute = (f"{_BASE}{hit.group(1)}" for hit in _SCENE_URL_RE.finditer(listing_html))
+        # dict keys keep insertion order, so duplicates drop out without reordering.
+        return list(dict.fromkeys(absolute))
+
+    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
+        """Build a ``RawScene`` from a scene detail page.
+
+        Returns ``None`` when no title is found in the JSON-LD ``name``, the
+        ``html5player.setVideoTitle`` call, or the ``og:title`` meta tag.
+        """
+        ld = _video_object(detail_html) or {}
+
+        # Title sources in priority order: JSON-LD, player script, Open Graph.
+        scene_title = (ld.get("name") or "").strip()
+        if not scene_title:
+            player_title = _SETTITLE_RE.search(detail_html)
+            if player_title:
+                scene_title = player_title.group(1).strip()
+            else:
+                scene_title = (meta_content(detail_html, property="og:title") or "").strip()
+        if not scene_title:
+            return None
+
+        length_sec = _dur_to_sec(ld.get("duration"))
+        published = _iso_date(ld.get("uploadDate") or ld.get("datePublished"))
+
+        thumb = ld.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
+        if isinstance(thumb, list):
+            # Several thumbnails may be listed; keep only the first.
+            thumb = thumb[0] if thumb else None
+
+        # Performers: /models/ links, keyed by slug to drop repeats, capped at 8.
+        # (Original author's note: these links are scene-specific; the site
+        # navigation uses a different pattern.)
+        cast: dict[str, RawPerformer] = {}
+        for hit in _MODEL_RE.finditer(detail_html):
+            model_slug, label = hit.groups()
+            label = label.strip()
+            if not label or model_slug in cast or label.lower() in ("models", "pornstars"):
+                continue
+            cast[model_slug] = RawPerformer(external_id=f"{self.sitetag}:model:{model_slug}", name=label)
+            if len(cast) >= 8:
+                break
+
+        # Tags: /tags/ links, keyed by slug to drop repeats, capped at 15.
+        labels: dict[str, RawTag] = {}
+        for hit in _TAG_RE.finditer(detail_html):
+            tag_slug = hit.group(1)
+            if tag_slug in labels or len(tag_slug) > 60:
+                continue
+            labels[tag_slug] = RawTag(
+                external_id=f"{self.sitetag}:tag:{tag_slug}",
+                name=tag_slug.replace("-", " "),
+                slug=tag_slug,
+            )
+            if len(labels) >= 15:
+                break
+
+        # Playback points at the scene page itself rather than a direct stream URL.
+        stream = RawPlaybackSource(
+            origin=f"tube:{self.sitetag}",
+            page_url=scene_url,
+            duration_sec=length_sec,
+            thumbnail_url=thumb,
+        )
+        return RawScene(
+            external_id=f"{self.sitetag}:{scene_url}",
+            title=scene_title,
+            duration_sec=length_sec,
+            release_date=published,
+            url=scene_url,
+            performers=list(cast.values()),
+            tags=list(labels.values()),
+            playback_sources=[stream],
+            raw={"source": "xvideos_browse"},
+        )
diff --git a/app/connectors/direct_scrapers/yesporn.py b/app/connectors/direct_scrapers/yesporn.py
new file mode 100644
index 0000000..2354d31
--- /dev/null
+++ b/app/connectors/direct_scrapers/yesporn.py
@@ -0,0 +1,321 @@
+"""yesporn.vip — latest-vids browse scraper.
+
+Dołączony 2026-05-27. Identyfikowany przez user audit jako "scraper-of-paysites"
+(DogFart / HardX / TeamSkeet / Vixen / Brazzers content). Wcześniejszy theporndude
+audit pomylił domeny: `yespornvip.com` (z theporndude rankingu) redirectuje przez
+pdude.link do `porndudecams.com` affiliate spam — kanoniczna domena ma TLD `.vip`.
+
+Czemu wart (parity z porndoe):
+ - **JSON-LD VideoObject** w każdym scene page: name, description, uploadDate
+ (ISO `YYYY-MM-DDTHH:MM:SS`), duration (ISO `PT0H39M00S`), thumbnailUrl
+ (BunnyCDN: `yesnn.b-cdn.net/contents/videos_screenshots/...`).
+ - **`<meta property="video:duration">`** — durations już w sekundach
+ (fallback gdy ISO-duration parse fail).
+ - **`<meta property="video:release_date">`** — ISO 8601 z timezone, redundant z
+ JSON-LD uploadDate ale czystszy format.
+ - **`<meta property="video:tag">`** (multiple) — kanoniczna lista tagów (np.
+ "Big Ass", "Threesome"). Główne źródło tagów; alternatywnie DOM ma `btn gold`
+ linki ale te miksują performerów/studio z tagami.
+ - **Studio + Performers**: oba w sekcji ``
+ (studio, singular) i `` (performerzy,
+ multiple). Slugi mają stable per-type salt (`*-i459s7` dla modeli, `*-7p72tp`
+ dla channels) — zachowują się jak hash z site-version, ale stabilne przez
+ sesje.
+
+External_id strategia: `yespornvip:<id>` (`/video/69841/...` → `69841`).
+Slug w URL ma `*-npu57w` suffix który wygląda na stałe-per-page-type, ale id
+numeryczne jest bezpieczniejsze gdyby site zmienił salt.
+
+URL patterns:
+ - Listing: `/latest-updates/` (page 1) / `/latest-updates/N/` (page>1)
+ - Scene: `/video/<id>/<slug>/` (id numeryczny, slug = title slug + 6-char salt)
+ - Studio: `/channels/<slug>/`
+ - Performer: `/models/<slug>/`
+ - Search: `/search/<query>/` (nie używane w browse-mode — można dorobić jako
+ osobny tryb dla performer-driven backfill jeśli będzie potrzeba)
+
+Playback: download endpoint `/view_video_download.php?id=<id>&format=<480|720|1080>`
+z `data-attach-session="PHPSESSID"` — wymaga session cookie, więc nie direct mp4
+z server-side. Plus jest `embedUrl: /embed/<id>` w JSON-LD. Extractor →
+`_vps_blocked_fallback.extract` (zgodne z pre-public bandwidth/anonymity policy):
+mobile WebView fetcha embed z phone IP, INJECTED_JS scrape'uje `