feat(ingest): add youporn browse scraper (JSON-LD only, alongside search)
Browse over /browse/time/?page=<n> (SSR) for guaranteed latest-feed freshness next to the existing performer-driven search scraper. JSON-LD VideoObject only (title / duration / uploadDate / thumbnail) — deliberately NOT scraping performers/tags from the detail page: JSON-LD has no actor field, and the /pornstar/ and /category/ links are polluted by the sidebar with no scene-scoped container, so a naive regex attached the same 2 pornstars to every scene. Performers/tags come via canonical merge + the search scraper instead. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
55612e262b
commit
2051fc1ded
2 changed files with 120 additions and 0 deletions
|
|
@ -38,6 +38,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||||
from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper
|
from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper
|
||||||
|
from app.connectors.direct_scrapers.youporn_browse import YouPornBrowseScraper
|
||||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||||
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||||
|
|
@ -149,6 +150,7 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# wprost z feedu (watchdog 48h zamiast 168h). Konwersja 2026-06-24 (user request).
|
# wprost z feedu (watchdog 48h zamiast 168h). Konwersja 2026-06-24 (user request).
|
||||||
PornTrexBrowseScraper,
|
PornTrexBrowseScraper,
|
||||||
MyPornerLeakBrowseScraper,
|
MyPornerLeakBrowseScraper,
|
||||||
|
YouPornBrowseScraper,
|
||||||
FreshpornoScraper,
|
FreshpornoScraper,
|
||||||
FpoxxxScraper,
|
FpoxxxScraper,
|
||||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||||
|
|
|
||||||
118
app/connectors/direct_scrapers/youporn_browse.py
Normal file
118
app/connectors/direct_scrapers/youporn_browse.py
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
"""youporn.com — latest BROWSE scraper (JSON-LD + page-parse), obok search scrapera.
|
||||||
|
|
||||||
|
YouPornScraper (search) zostaje w ALL_DIRECT_SCRAPERS; ten browse dokłada świeżość
|
||||||
|
wprost z `/browse/time/?page=<n>` (newest-first, SSR). Detail page ma JSON-LD
|
||||||
|
VideoObject (name/duration/uploadDate/thumbnail) — i TYLKO to bierzemy.
|
||||||
|
|
||||||
|
UWAGA: performerów/tagów z detail-strony NIE wyciągamy. JSON-LD nie ma pola `actor`,
|
||||||
|
a linki `/pornstar/` i `/category/` na stronie są zaśmiecone sidebarem (popularne
|
||||||
|
pornstars/related) bez czystego scene-scoped kontenera — naiwny regex podpinał te
|
||||||
|
same 2 pornstars do KAŻDEJ sceny (mass-misattribution). Browse to tylko sygnał
|
||||||
|
świeżości (próg watchdog 48h); performerów/tagi dorabia canonical-merge + istniejący
|
||||||
|
search scraper (performer-driven). Listing SSR (/watch/<id>/); homepage JS-renderowany.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Site origin; the site-relative scene hrefs captured from listings are prefixed with it.
_BASE = "https://www.youporn.com"
# Captures double-quoted, site-relative hrefs of the form /watch/<digits>...,
# including whatever follows the id up to the closing quote.
_SCENE_URL_RE = re.compile(r'href="(/watch/\d+[^"]*)"', re.IGNORECASE)
# Captures the body of each <script type="application/ld+json"> element.
# DOTALL lets the (non-greedy) body span multiple lines.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
)
|
||||||
|
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _dur_to_sec(value: str | None) -> int | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
m = _ISO_DUR_RE.match(str(value).strip())
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
||||||
|
return total or None
|
||||||
|
|
||||||
|
|
||||||
|
def _iso_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value))
|
||||||
|
return date.fromisoformat(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _video_object(html: str) -> dict | None:
    """Return the first JSON-LD ``VideoObject`` embedded in *html*.

    Every ``application/ld+json`` script body matched by ``_JSONLD_RE`` is
    decoded in document order. A decoded list is searched item by item; a
    decoded object is searched through its ``@graph`` list when it has one,
    otherwise the object itself is the only candidate.

    Args:
        html: Raw HTML of a page.

    Returns:
        The first object whose ``@type`` is ``"VideoObject"`` (either as a
        plain string or as one entry of a type array), or ``None`` when no
        script body decodes to such an object.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            # Only trust "@graph" when it really is a list; a null or
            # otherwise malformed value falls back to the object itself.
            items = graph if isinstance(graph, list) else [data]
        else:
            continue

        for obj in items:
            if not isinstance(obj, dict):
                continue
            types = obj.get("@type")
            # JSON-LD allows "@type" to be a single string or an array of strings.
            if types == "VideoObject" or (isinstance(types, list) and "VideoObject" in types):
                return obj
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class YouPornBrowseScraper(BaseBrowseScraper):
    """Browse scraper for the youporn.com ``/browse/time/`` listing.

    Scenes are built from the detail page's JSON-LD ``VideoObject`` with
    ``og:`` meta tags as a fallback for title and thumbnail. Performers and
    tags are always emitted empty.
    """

    sitetag = "youporncom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page number *page*."""
        return f"{_BASE}/browse/time/?page={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs found in *listing_html*.

        Duplicates are dropped; the first occurrence of each URL keeps its
        position.
        """
        absolute = (f"{_BASE}{hit.group(1)}" for hit in _SCENE_URL_RE.finditer(listing_html))
        # dict keys preserve insertion order, giving an order-stable dedup.
        return list(dict.fromkeys(absolute))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Returns ``None`` when neither the JSON-LD ``name`` nor the
        ``og:title`` meta tag yields a non-blank title.
        """
        ld = _video_object(detail_html) or {}

        # Title: JSON-LD first, og:title only when JSON-LD has nothing usable.
        title = (ld.get("name") or "").strip()
        if not title:
            title = (meta_content(detail_html, property="og:title") or "").strip()
        if not title:
            return None

        seconds = _dur_to_sec(ld.get("duration"))
        released = _iso_date(ld.get("uploadDate") or ld.get("datePublished"))

        thumb = ld.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
        if isinstance(thumb, list):
            # A list-valued thumbnail: keep its first entry.
            thumb = thumb[0] if thumb else None

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=seconds,
            thumbnail_url=thumb,
        )

        # Performers/tags are left empty on purpose: the detail page's
        # /pornstar/ and /category/ links are polluted by the sidebar and have
        # no scene-scoped container. Canonical merge and the search scraper
        # fill them in.
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=seconds,
            release_date=released,
            url=scene_url,
            performers=[],
            tags=[],
            playback_sources=[playback],
            raw={"source": "youporn_browse"},
        )
|
||||||
Loading…
Add table
Reference in a new issue