From 1ca503b7be1591d62135dcc98a14063ac6a0f530 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 24 Jun 2026 15:52:32 +0200 Subject: [PATCH] feat(ingest): add xnxx browse scraper (JSON-LD only, alongside search) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Browse over /best/<YYYY-MM>/<page> (SSR; xnxx has no clean /new/ and its homepage is JS-rendered) for a latest-feed freshness signal next to the performer-driven search scraper. JSON-LD VideoObject only — xnxx detail (unlike its xvideos twin) doesn't expose /models/ or /tags/ in SSR, so performers/tags come via canonical merge + the search scraper. Title is html.unescaped (JSON-LD ships &comma;/&excl; entities). xhamster and sxyprn intentionally left search-only: xhamster Cloudflare-blocks the VPS on listing pages (1KB challenge), sxyprn has no clean SSR listing (IP-bound) — a flaky browse scraper would be worse than the working search + 168h watchdog. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/connectors/direct_scrapers/__init__.py | 2 + app/connectors/direct_scrapers/xnxx_browse.py | 129 ++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 app/connectors/direct_scrapers/xnxx_browse.py diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index d87de71..1292fc5 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -38,6 +38,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper from app.connectors.direct_scrapers.porndish import PornDishScraper from app.connectors.direct_scrapers.porntrex import PornTrexScraper from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper +from app.connectors.direct_scrapers.xnxx_browse import XnxxBrowseScraper from app.connectors.direct_scrapers.youporn_browse import YouPornBrowseScraper from app.connectors.direct_scrapers.siska import SiskaScraper from app.connectors.direct_scrapers.sxyland 
import SxyLandScraper @@ -151,6 +152,7 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ PornTrexBrowseScraper, MyPornerLeakBrowseScraper, YouPornBrowseScraper, + XnxxBrowseScraper, FreshpornoScraper, FpoxxxScraper, # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven diff --git a/app/connectors/direct_scrapers/xnxx_browse.py b/app/connectors/direct_scrapers/xnxx_browse.py new file mode 100644 index 0000000..a8ec829 --- /dev/null +++ b/app/connectors/direct_scrapers/xnxx_browse.py @@ -0,0 +1,129 @@ +"""xnxx.com — BROWSE scraper (JSON-LD), obok search scrapera. + +Detail page ma JSON-LD VideoObject (name/duration/uploadDate/thumbnail) — i TYLKO to +bierzemy. Mimo wspólnego silnika z xvideos, xnxx detail NIE wystawia w SSR linków +`/models/` ani `/tags/` (0 wystąpień, ładowane JS-em) → performerów/tagi dorabia +canonical-merge + istniejący performer-search. XnxxScraper (search) zostaje; browse +dokłada sygnał świeżości. Tytuł z JSON-LD bywa HTML-encoded (`&comma;`/`&excl;`) → +html.unescape. + +Listing: xnxx NIE ma czystego SSR `/new/` (404), ale `/best/<YYYY-MM>/<page>` jest +SSR (linki /video-<id>/<slug> w surowym HTML). Bierzemy bieżący miesiąc — pokrywa świeży +content (sortowanie best-of-month, nie ściśle chronologiczne, ale dla sygnału +świeżości wystarcza; ścisłą chronologię i tak daje performer-search). Homepage +xnxx jest JS-renderowany (0 linków w surowym HTML), stąd /best/. + +Playback bez zmian (extractor `xnxxcom`). Phash pominięty (xnxx crop-thumbnaile, +0% hit do canonical — jak xvideos). 
+"""
+from __future__ import annotations
+
+import html
+import json
+import logging
+import re
+from datetime import date, datetime
+
+from app.connectors.base import RawPlaybackSource, RawScene
+from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
+
+log = logging.getLogger(__name__)
+
+_BASE = "https://www.xnxx.com"
+# Relative scene links found on a listing page: /video-<id>/<slug>.
+_SCENE_URL_RE = re.compile(r'href="(/video-[a-z0-9]+/[a-z0-9_\-]+)"', re.IGNORECASE)
+# Body of every <script type="application/ld+json"> element (DOTALL: payload may span lines).
+_JSONLD_RE = re.compile(
+    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
+)
+# Title fallback taken from the inline player bootstrap call.
+_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
+# ISO-8601 time-only duration (PT#H#M#S); every component is optional.
+_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
+
+
+def _dur_to_sec(value: str | None) -> int | None:
+    """Convert an ISO-8601 ``PT#H#M#S`` duration to whole seconds.
+
+    Returns None when the value is empty, does not start with ``PT``, or
+    adds up to zero seconds.
+    """
+    if not value:
+        return None
+    m = _ISO_DUR_RE.match(str(value).strip())
+    if not m:
+        return None
+    total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
+    return total or None
+
+
+def _iso_date(value: str | None) -> date | None:
+    """Parse an ISO-8601 date or datetime string into a ``date``.
+
+    Tries ``datetime.fromisoformat`` first (a trailing ``Z`` is rewritten to
+    ``+00:00``), then falls back to a leading ``YYYY-MM-DD`` prefix. Returns
+    None when neither attempt yields a valid calendar date.
+    """
+    if not value:
+        return None
+    text = str(value)
+    try:
+        return datetime.fromisoformat(text.replace("Z", "+00:00")).date()
+    except ValueError:
+        pass
+    m = re.match(r"(\d{4}-\d{2}-\d{2})", text)
+    if not m:
+        return None
+    try:
+        return date.fromisoformat(m.group(1))
+    except ValueError:
+        # Shape matched but the date is out of range (e.g. month 13).
+        return None
+
+
+def _video_object(html: str) -> dict | None:
+    """Return the first JSON-LD ``VideoObject`` found in *html*, or None.
+
+    Script bodies that are empty or not valid JSON are skipped. A payload may
+    be a list of nodes, a dict carrying ``@graph``, or a single node.
+    """
+    for m in _JSONLD_RE.finditer(html):
+        raw = m.group(1).strip()
+        if not raw:
+            continue
+        try:
+            data = json.loads(raw)
+        except (json.JSONDecodeError, ValueError):
+            continue
+        items = data if isinstance(data, list) else (data.get("@graph", [data]) if isinstance(data, dict) else [])
+        for obj in items:
+            if not isinstance(obj, dict):
+                continue
+            # JSON-LD allows "@type" to be a single string or a list of types.
+            kind = obj.get("@type")
+            kinds = kind if isinstance(kind, list) else [kind]
+            if "VideoObject" in kinds:
+                return obj
+    return None
+
+
+class XnxxBrowseScraper(BaseBrowseScraper):
+    """Browse scraper for xnxx.com that relies on JSON-LD detail metadata only."""
+
+    sitetag = "xnxxcom"
+
+    def _listing_url(self, page: int) -> str:
+        """Build the ``/best/<YYYY-MM>/<page>`` listing URL for the current local month."""
+        month = datetime.now().strftime("%Y-%m")
+        return f"{_BASE}/best/{month}/{page}"
+
+    def _extract_scene_urls(self, listing_html: str) -> list[str]:
+        """Return absolute scene URLs from a listing page, de-duplicated in page order."""
+        seen: set[str] = set()
+        out: list[str] = []
+        for m in _SCENE_URL_RE.finditer(listing_html):
+            url = f"{_BASE}{m.group(1)}"
+            if url in seen:
+                continue
+            seen.add(url)
+            out.append(url)
+        return out
+
+    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
+        """Build a ``RawScene`` from a detail page, or None when no title can be found.
+
+        Title precedence: JSON-LD ``name``, then the ``setVideoTitle`` player
+        call, then ``og:title``. Duration, release date and thumbnail come
+        from JSON-LD; the thumbnail falls back to ``og:image``.
+        """
+        video = _video_object(detail_html) or {}
+        name = video.get("name")
+        # Guard against a non-string "name" so .strip() cannot raise.
+        title = name.strip() if isinstance(name, str) else ""
+        if not title:
+            m = _SETTITLE_RE.search(detail_html)
+            title = m.group(1).strip() if m else (meta_content(detail_html, property="og:title") or "").strip()
+        # The title may carry HTML entities; decode before the emptiness check.
+        title = html.unescape(title).strip()
+        if not title:
+            return None
+
+        duration_sec = _dur_to_sec(video.get("duration"))
+        release_date = _iso_date(video.get("uploadDate") or video.get("datePublished"))
+        thumbnail_url = video.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
+        if isinstance(thumbnail_url, list):
+            thumbnail_url = thumbnail_url[0] if thumbnail_url else None
+
+        # performers/tags stay empty: the xnxx detail page does not expose them in SSR (see module docstring).
+        return RawScene(
+            external_id=f"{self.sitetag}:{scene_url}",
+            title=title,
+            duration_sec=duration_sec,
+            release_date=release_date,
+            url=scene_url,
+            performers=[],
+            tags=[],
+            playback_sources=[
+                RawPlaybackSource(
+                    origin=f"tube:{self.sitetag}",
+                    page_url=scene_url,
+                    duration_sec=duration_sec,
+                    thumbnail_url=thumbnail_url,
+                )
+            ],
+            raw={"source": "xnxx_browse"},
+        )