feat(ingest): add xnxx browse scraper (JSON-LD only, alongside search)
Browse over /best/<YYYY-MM>/<page> (SSR; xnxx has no clean /new/ and its homepage is JS-rendered) for a latest-feed freshness signal next to the performer-driven search scraper. JSON-LD VideoObject only — xnxx detail (unlike its xvideos twin) doesn't expose /models/ or /tags/ in SSR, so performers/tags come via canonical merge + the search scraper. The title is passed through html.unescape (JSON-LD ships `,` and `!` as HTML entities, e.g. `&comma;`/`&excl;`). xhamster and sxyprn intentionally left search-only: xhamster Cloudflare-blocks the VPS on listing pages (1KB challenge), sxyprn has no clean SSR listing (IP-bound) — a flaky browse scraper would be worse than the working search + 168h watchdog. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2051fc1ded
commit
1ca503b7be
2 changed files with 131 additions and 0 deletions
|
|
@ -38,6 +38,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
|||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||
from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper
|
||||
from app.connectors.direct_scrapers.xnxx_browse import XnxxBrowseScraper
|
||||
from app.connectors.direct_scrapers.youporn_browse import YouPornBrowseScraper
|
||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||
|
|
@ -151,6 +152,7 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
|||
PornTrexBrowseScraper,
|
||||
MyPornerLeakBrowseScraper,
|
||||
YouPornBrowseScraper,
|
||||
XnxxBrowseScraper,
|
||||
FreshpornoScraper,
|
||||
FpoxxxScraper,
|
||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||
|
|
|
|||
129
app/connectors/direct_scrapers/xnxx_browse.py
Normal file
129
app/connectors/direct_scrapers/xnxx_browse.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
"""xnxx.com — BROWSE scraper (JSON-LD), obok search scrapera.
|
||||
|
||||
Detail page ma JSON-LD VideoObject (name/duration/uploadDate/thumbnail) — i TYLKO to
|
||||
bierzemy. Mimo wspólnego silnika z xvideos, xnxx detail NIE wystawia w SSR linków
|
||||
`/models/` ani `/tags/` (0 wystąpień, ładowane JS-em) → performerów/tagi dorabia
|
||||
canonical-merge + istniejący performer-search. XnxxScraper (search) zostaje; browse
|
||||
dokłada sygnał świeżości. Tytuł z JSON-LD bywa HTML-encoded (encje dla `,`/`!`, np. `&comma;`/`&excl;`) →
|
||||
html.unescape.
|
||||
|
||||
Listing: xnxx NIE ma czystego SSR `/new/` (404), ale `/best/<YYYY-MM>/<page>` jest
|
||||
SSR (linki /video-<id>/ w surowym HTML). Bierzemy bieżący miesiąc — pokrywa świeży
|
||||
content (sortowanie best-of-month, nie ściśle chronologiczne, ale dla sygnału
|
||||
świeżości wystarcza; ścisłą chronologię i tak daje performer-search). Homepage
|
||||
xnxx jest JS-renderowany (0 linków w surowym HTML), stąd /best/.
|
||||
|
||||
Playback bez zmian (extractor `xnxxcom`). Phash pominięty (xnxx crop-thumbnaile,
|
||||
0% hit do canonical — jak xvideos).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import date, datetime
|
||||
|
||||
from app.connectors.base import RawPlaybackSource, RawScene
|
||||
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Site origin; the relative scene paths captured from listings are appended to it.
_BASE = "https://www.xnxx.com"

# Captures relative scene links of the form /video-<id>/<slug> from href
# attributes in listing HTML (case-insensitive).
_SCENE_URL_RE = re.compile(
    r'href="(/video-[a-z0-9]+/[a-z0-9_\-]+)"',
    flags=re.I,
)

# Captures the body of each <script type="application/ld+json"> element.
# DOTALL lets the payload span several lines.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    flags=re.S | re.I,
)

# Captures the single-quoted argument of an inline
# html5player.setVideoTitle('...') call; used as a title fallback.
_SETTITLE_RE = re.compile(
    r"html5player\.setVideoTitle\('([^']+)'\)",
)
|
||||
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
|
||||
|
||||
|
||||
def _dur_to_sec(value: str | None) -> int | None:
|
||||
if not value:
|
||||
return None
|
||||
m = _ISO_DUR_RE.match(str(value).strip())
|
||||
if not m:
|
||||
return None
|
||||
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
||||
return total or None
|
||||
|
||||
|
||||
def _iso_date(value: str | None) -> date | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date()
|
||||
except ValueError:
|
||||
m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value))
|
||||
return date.fromisoformat(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _video_object(html: str) -> dict | None:
    """Return the first JSON-LD ``VideoObject`` node embedded in a page.

    Every ``application/ld+json`` script body matched by ``_JSONLD_RE`` is
    decoded in document order. A payload may be a list of nodes, a dict with
    an ``@graph`` member (a list of nodes or a single node), or a single
    node. A node qualifies when its ``@type`` is ``"VideoObject"`` or an
    array containing it.

    Args:
        html: Raw page HTML. Note the parameter name shadows the ``html``
            module inside this function; the module is not used here.

    Returns:
        The matching node as a dict, or None when no script block contains a
        VideoObject. Empty or undecodable script bodies are skipped.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            if isinstance(graph, list):
                items = graph
            elif isinstance(graph, dict):
                # JSON-LD allows @graph to hold a single node object.
                items = [graph]
            else:
                items = [data]
        else:
            # Scalar payload (string/number/null): nothing to inspect.
            continue

        for obj in items:
            if not isinstance(obj, dict):
                continue
            node_type = obj.get("@type")
            # @type may be a single string or an array of type names.
            type_names = node_type if isinstance(node_type, list) else [node_type]
            if "VideoObject" in type_names:
                return obj
    return None
|
||||
|
||||
|
||||
class XnxxBrowseScraper(BaseBrowseScraper):
    """Browse scraper for xnxx.com driven by the monthly ``/best/`` listing.

    Scene metadata (title, duration, date, thumbnail) is read from the detail
    page's JSON-LD VideoObject, with fallbacks for title and thumbnail.
    Performers and tags are always emitted empty by this scraper.
    """

    sitetag = "xnxxcom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page* of the current month's best-of list."""
        current_month = datetime.now().strftime("%Y-%m")
        return f"{_BASE}/best/{current_month}/{page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs found in *listing_html*.

        Duplicates are dropped and first-seen order is preserved.
        """
        absolute = (
            f"{_BASE}{match.group(1)}"
            for match in _SCENE_URL_RE.finditer(listing_html)
        )
        # dict keys keep insertion order, so this de-duplicates without reordering.
        return list(dict.fromkeys(absolute))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a detail page, or None if no title is found.

        Title sources, in order: JSON-LD ``name``, the inline
        ``html5player.setVideoTitle('...')`` call, then the ``og:title`` meta
        tag. HTML entities in the chosen title are decoded.
        """
        video = _video_object(detail_html) or {}

        title = (video.get("name") or "").strip()
        if not title:
            player_title = _SETTITLE_RE.search(detail_html)
            if player_title:
                title = player_title.group(1).strip()
            else:
                og_title = meta_content(detail_html, property="og:title")
                title = (og_title or "").strip()
        title = html.unescape(title).strip()
        if not title:
            return None

        length_sec = _dur_to_sec(video.get("duration"))
        published = _iso_date(video.get("uploadDate") or video.get("datePublished"))

        thumb = video.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
        if isinstance(thumb, list):
            # JSON-LD may list several thumbnails; keep the first one.
            thumb = thumb[0] if thumb else None

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=length_sec,
            thumbnail_url=thumb,
        )

        # performers/tags stay empty: this scraper reads only the JSON-LD
        # VideoObject and the title/thumbnail fallbacks above.
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=length_sec,
            release_date=published,
            url=scene_url,
            performers=[],
            tags=[],
            playback_sources=[playback],
            raw={"source": "xnxx_browse"},
        )
|
||||
Loading…
Add table
Reference in a new issue