feat(superporn): browse scraper via Bright Data residential proxy
superporn hard-blocks the VPS IP with Cloudflare 403 on every TLS impersonation, so HTML ingest routes through Bright Data residential (BRIGHTDATA_PROXY_URL, parsed in config). First scraper to use a proxy: optional _proxy on the browse base, threaded into browser_get. JSON-LD VideoObject (title/desc/uploadDate/thumb/duration) + pornstar and category chips; superporn double-encodes HTML entities so titles are unescaped twice. Thumbnails fetch fine from the VPS (no proxy). Playback stays off-proxy: the <source> mp4 token is IP-bound to the fetcher, so resolve is phone-side via WebView (extractor superporncom -> _vps_blocked_fallback), same as porndoe. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
80fd83cb4e
commit
21bc8bf1fe
6 changed files with 242 additions and 4 deletions
|
|
@ -121,6 +121,22 @@ class Settings(BaseSettings):
|
||||||
hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
|
hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
|
||||||
hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")
|
hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")
|
||||||
|
|
||||||
|
# Bright Data residential proxy — używany TYLKO do ingestu HTML (scrape) tubów
|
||||||
|
# które blokują VPS IP twardym Cloudflare 403 nawet z browser-TLS (superporn).
|
||||||
|
# NIE do streamowania wideo (transfer leciałby przez płatne proxy + tokeny i tak
|
||||||
|
# IP-bound). Format env: `host:port:user:pass` (panel Bright Data). Pusty = brak.
|
||||||
|
brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def brightdata_proxy_url(self) -> str | None:
|
||||||
|
"""`host:port:user:pass` → `http://user:pass@host:port` dla curl_cffi/httpx.
|
||||||
|
None gdy nieustawiony lub w złym formacie."""
|
||||||
|
parts = self.brightdata_proxy_raw.split(":")
|
||||||
|
if len(parts) != 4 or not all(parts):
|
||||||
|
return None
|
||||||
|
host, port, user, pwd = parts
|
||||||
|
return f"http://{user}:{pwd}@{host}:{port}"
|
||||||
|
|
||||||
@property
def api_keys(self) -> set[str]:
    """Return the distinct, non-empty API keys from ``api_keys_raw``.

    The raw setting is a comma-separated string; each entry is stripped of
    surrounding whitespace and blank entries are dropped.
    """
    trimmed = (chunk.strip() for chunk in self.api_keys_raw.split(","))
    return {key for key in trimmed if key}
|
||||||
|
|
|
||||||
|
|
@ -156,6 +156,7 @@ from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E40
|
||||||
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402
|
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402
|
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
||||||
|
|
||||||
|
|
@ -224,6 +225,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# "- HardX Update - ..." — fuzzy match po tytule). Resolve server-side _kvs,
|
# "- HardX Update - ..." — fuzzy match po tytule). Resolve server-side _kvs,
|
||||||
# finalny remote_control.php portable cross-IP.
|
# finalny remote_control.php portable cross-IP.
|
||||||
NepornScraper,
|
NepornScraper,
|
||||||
|
# SuperpornScraper — dołączony 2026-06-10 (user request). superporn blokuje VPS IP
|
||||||
|
# twardym CF 403 (każda impersonacja TLS), więc ingest HTML idzie przez Bright Data
|
||||||
|
# residential proxy (BRIGHTDATA_PROXY_URL). Pierwszy scraper z proxy — `_proxy` w
|
||||||
|
# _browse_base. JSON-LD (title+desc+uploadDate+thumb+duration) + chipy pornstar/
|
||||||
|
# kategorie. Playback IP-bound → WebView (extractor superporncom → _vps_blocked_fallback).
|
||||||
|
# Bez proxy: scraper no-op (pusty iterator).
|
||||||
|
SuperpornScraper,
|
||||||
# porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
|
# porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
|
||||||
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,11 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
|
||||||
_timeout: float = 30.0
|
_timeout: float = 30.0
|
||||||
"""HTTP timeout per request."""
|
"""HTTP timeout per request."""
|
||||||
|
|
||||||
|
_proxy: str | None = None
|
||||||
|
"""Opcjonalny proxy (http://user:pass@host:port) dla listing+detail fetchy.
|
||||||
|
Ustawiany przez scrapery tubów blokujących VPS IP (superporn → Bright Data
|
||||||
|
residential). None = bezpośredni fetch (domyślnie)."""
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def _listing_url(self, page: int) -> str:
|
def _listing_url(self, page: int) -> str:
|
||||||
"""URL listing page'a 'latest-vids' (page 1 = newest)."""
|
"""URL listing page'a 'latest-vids' (page 1 = newest)."""
|
||||||
|
|
@ -78,7 +83,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
|
||||||
"""
|
"""
|
||||||
url = self._listing_url(page)
|
url = self._listing_url(page)
|
||||||
try:
|
try:
|
||||||
res = browser_get(url, timeout=self._timeout)
|
res = browser_get(url, timeout=self._timeout, proxy=self._proxy)
|
||||||
html = res.text if hasattr(res, "text") else res
|
html = res.text if hasattr(res, "text") else res
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e)
|
log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e)
|
||||||
|
|
@ -92,7 +97,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
|
||||||
out: list[RawScene] = []
|
out: list[RawScene] = []
|
||||||
for scene_url in urls:
|
for scene_url in urls:
|
||||||
try:
|
try:
|
||||||
res = browser_get(scene_url, timeout=self._timeout)
|
res = browser_get(scene_url, timeout=self._timeout, proxy=self._proxy)
|
||||||
detail_html = res.text if hasattr(res, "text") else res
|
detail_html = res.text if hasattr(res, "text") else res
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
|
log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
|
||||||
|
|
|
||||||
194
app/connectors/direct_scrapers/superporn.py
Normal file
194
app/connectors/direct_scrapers/superporn.py
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
"""superporn.com — latest-vids browse scraper (przez Bright Data residential proxy).
|
||||||
|
|
||||||
|
Dołączony 2026-06-10 (user request). superporn twardo blokuje VPS IP Cloudflarem
|
||||||
|
(403 na KAŻDEJ impersonacji TLS — chrome/safari/firefox), więc ingest HTML idzie
|
||||||
|
przez Bright Data residential proxy (`settings.brightdata_proxy_url`). Gdy proxy
|
||||||
|
nieskonfigurowane → scraper no-op (pusty iterator, log warning).
|
||||||
|
|
||||||
|
Proxy używamy TYLKO do scrape HTML. Playback NIE idzie przez proxy: `<source>` mp4
|
||||||
|
(cdnst*.superporn.com) ma token IP-bound do fetchera (403 cross-IP), więc resolve
|
||||||
|
musi nastąpić po stronie telefonu — extractor `superporncom` → `_vps_blocked_fallback`
|
||||||
|
(mobile WebView ładuje stronę z residential IP telefonu, INJECTED_JS bierze video.src).
|
||||||
|
Thumbnaile (img*.superporn.com) schodzą z VPS bez proxy (image proxy działa).
|
||||||
|
|
||||||
|
Sygnały (SSR HTML):
|
||||||
|
- JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl, duration
|
||||||
|
ISO 8601 (`P0DT0H38M48S`). `author` = uploader (NIE performer — pomijamy).
|
||||||
|
- Performerzy: chip `<a class="chip-link" href=".../pornstar/<slug>"><span>Name</span>`
|
||||||
|
- Kategorie: chip `<a class="chip-link" href="/<slug>"><span>Name</span>` (bez
|
||||||
|
`/pornstar/`) — w bloku `#collapse-categories`.
|
||||||
|
- Duration backup: `<video ... data-video-duration="2328">` (sekundy).
|
||||||
|
|
||||||
|
Listing: `?page=N` (newest-first; sitemap lastmod jest z 2024, bezużyteczny).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._playtube import (
|
||||||
|
_extract_video_object,
|
||||||
|
_parse_iso_date,
|
||||||
|
)
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Site origin; listing URLs are built from it and it is sent (with a trailing
# slash) as the Referer when fetching thumbnails.
_BASE = "https://www.superporn.com"

# Absolute scene links (`/video/<slug>`) as they appear in listing HTML.
_SCENE_URL_RE = re.compile(r'href="(https://www\.superporn\.com/video/[a-z0-9\-]+)"', re.IGNORECASE)
# Performer chip: href .../pornstar/<slug> (absolute or relative), name inside <span>.
_PERF_CHIP_RE = re.compile(
    r'<a[^>]+class="chip-link"[^>]+href="[^"]*/pornstar/[^"]*"[^>]*>.*?<span>([^<]+)</span>',
    re.IGNORECASE | re.DOTALL,
)
# Category chip: href="/<slug>" (NOT /pornstar/, NOT /video/, NOT an absolute http URL).
# Group 1 is the slug, group 2 the display name.
_CAT_CHIP_RE = re.compile(
    r'<a[^>]+class="chip-link"[^>]+href="/(?!pornstar/|video/)([a-z0-9\-]+)"[^>]*>.*?<span>([^<]+)</span>',
    re.IGNORECASE | re.DOTALL,
)
# Fallback duration in whole seconds from the <video> element's data attribute.
_DATA_DUR_RE = re.compile(r'data-video-duration="(\d+)"')
# ISO 8601 duration with a days component: `P0DT0H38M48S`.
# Groups are (days, hours, minutes, seconds); every component is optional.
_ISO_DUR_FULL_RE = re.compile(
    r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE
)
|
||||||
|
|
||||||
|
|
||||||
|
def _clean(text: str) -> str:
|
||||||
|
"""superporn dwukrotnie HTML-koduje encje w JSON-LD (`&#39;` zamiast `'`).
|
||||||
|
Unescape iteracyjnie (max 2) — bez over-decode dla zwykłych nazw."""
|
||||||
|
for _ in range(2):
|
||||||
|
new = html.unescape(text)
|
||||||
|
if new == text:
|
||||||
|
break
|
||||||
|
text = new
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_duration_days(value: str | None) -> int | None:
    """Convert an ISO 8601 duration that may carry a days part into seconds.

    Example: `P0DT0H38M48S` → 2328.

    Returns None when the input is missing/empty, when the pattern does not
    match, or when the parsed total is zero.
    """
    if not value:
        return None
    match = _ISO_DUR_FULL_RE.match(value.strip())
    if match is None:
        return None
    # Components absent from the string come back as None and count as zero.
    days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    seconds_total = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    return seconds_total if seconds_total else None
|
||||||
|
|
||||||
|
|
||||||
|
class SuperpornScraper(BaseBrowseScraper):
    """Latest-videos browse scraper for superporn.com, fetched through a proxy.

    Listing and detail pages are requested via the proxy URL taken from
    ``get_settings().brightdata_proxy_url``; when that is unset the scraper
    logs a warning once and every ``crawl_page`` call returns an empty list.

    Scene metadata comes from the JSON-LD VideoObject (name, description,
    uploadDate, thumbnailUrl, duration), performers and categories from the
    ``chip-link`` anchors, and the duration falls back to the
    ``data-video-duration`` attribute.
    """

    sitetag = "superporncom"

    def __init__(self) -> None:
        super().__init__()
        # Bright Data residential proxy — without it superporn is unreachable from the VPS.
        self._proxy = get_settings().brightdata_proxy_url
        if not self._proxy:
            log.warning("superporn: BRIGHTDATA_PROXY_URL unset — scraper disabled")

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; page 1 (or lower) is the site root."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/?page={page}"

    def crawl_page(self, page: int):
        """Crawl one listing page, or return an empty list when no proxy is configured."""
        # Without a proxy the request is a guaranteed Cloudflare 403, so report
        # "exhausted" (empty list) to keep the deep crawl from retrying forever.
        if not self._proxy:
            return []
        return super().crawl_page(page)

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in ``listing_html``, in first-seen order."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    @staticmethod
    def _first_url(value: object) -> str | None:
        """Normalise a JSON-LD URL field to a single stripped string or None.

        JSON-LD allows URL properties such as ``thumbnailUrl`` to be either a
        string or an array of strings; for an array the first non-blank string
        entry is used. Any other type yields None.
        """
        if isinstance(value, (list, tuple)):
            value = next(
                (item for item in value if isinstance(item, str) and item.strip()),
                None,
            )
        if not isinstance(value, str):
            return None
        return value.strip() or None

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Args:
            scene_url: URL of the detail page (also the source of the scene slug).
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when the JSON-LD carries no usable title.
        """
        video = _extract_video_object(detail_html) or {}

        title = _clean(video.get("name") or "")
        if not title:
            return None

        slug_m = re.search(r"/video/([a-z0-9\-]+)", scene_url, re.IGNORECASE)
        scene_slug = slug_m.group(1) if slug_m else None

        # Prefer the JSON-LD duration; fall back to the <video> data attribute.
        duration_sec = _parse_iso_duration_days(video.get("duration"))
        if duration_sec is None:
            dm = _DATA_DUR_RE.search(detail_html)
            if dm:
                duration_sec = int(dm.group(1)) or None

        release_date = _parse_iso_date(video.get("uploadDate"))
        description = _clean(video.get("description") or "") or None
        # thumbnailUrl may be a string or an array; calling .strip() on a list would raise.
        thumbnail_url = self._first_url(video.get("thumbnailUrl"))

        # Performers: de-duplicated by the slugified display name.
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _PERF_CHIP_RE.finditer(detail_html):
            name = _clean(m.group(1))
            slug = slugify(name)
            if not slug or slug in seen_perf:
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
            )

        # Categories: de-duplicated by the slug taken from the chip href.
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _CAT_CHIP_RE.finditer(detail_html):
            slug, name = m.group(1).strip(), _clean(m.group(2))
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Thumbnail phash — no proxy needed (img*.superporn.com is reachable from the VPS).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_slug or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
|
|
@ -188,6 +188,11 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# neporn — KVS function/0 + license (jak freshporno). Server-side _kvs resolve →
|
# neporn — KVS function/0 + license (jak freshporno). Server-side _kvs resolve →
|
||||||
# data001.neporn.com/remote_control.php portable (cross-IP 206, 2026-06-10).
|
# data001.neporn.com/remote_control.php portable (cross-IP 206, 2026-06-10).
|
||||||
"neporncom": neporn.extract,
|
"neporncom": neporn.extract,
|
||||||
|
# superporn — `<source>` mp4 (cdnst*.superporn.com) token IP-bound do fetchera
|
||||||
|
# (403 cross-IP, test 2026-06-10), a sama strona CF-blocked z VPS. Resolve MUSI
|
||||||
|
# być phone-side: WebView ładuje stronę z residential IP telefonu, INJECTED_JS
|
||||||
|
# bierze video.src. Ingest HTML idzie osobno przez Bright Data proxy (scraper).
|
||||||
|
"superporncom": _vps_blocked_fallback.extract,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -56,10 +56,18 @@ def browser_get(
|
||||||
timeout: float = 60.0,
|
timeout: float = 60.0,
|
||||||
follow_redirects: bool = True,
|
follow_redirects: bool = True,
|
||||||
impersonate: str = _DEFAULT_IMPERSONATE,
|
impersonate: str = _DEFAULT_IMPERSONATE,
|
||||||
|
proxy: str | None = None,
|
||||||
) -> FetchResult:
|
) -> FetchResult:
|
||||||
"""GET z Chrome TLS fingerprint (curl_cffi). Spada do httpx gdy curl_cffi brak."""
|
"""GET z Chrome TLS fingerprint (curl_cffi). Spada do httpx gdy curl_cffi brak.
|
||||||
|
|
||||||
|
`proxy` (http://user:pass@host:port) — routuje request przez proxy. Używane
|
||||||
|
tylko do ingestu HTML tubów blokujących VPS IP (np. superporn przez Bright Data
|
||||||
|
residential). NIE dla streamów."""
|
||||||
if not _HAS_CURL_CFFI:
|
if not _HAS_CURL_CFFI:
|
||||||
with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as http:
|
proxies = {"http://": proxy, "https://": proxy} if proxy else None
|
||||||
|
with httpx.Client(
|
||||||
|
timeout=timeout, follow_redirects=follow_redirects, proxies=proxies
|
||||||
|
) as http:
|
||||||
r = http.get(url, headers=dict(headers or {}))
|
r = http.get(url, headers=dict(headers or {}))
|
||||||
return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))
|
return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))
|
||||||
|
|
||||||
|
|
@ -69,6 +77,8 @@ def browser_get(
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
impersonate=impersonate,
|
impersonate=impersonate,
|
||||||
allow_redirects=follow_redirects,
|
allow_redirects=follow_redirects,
|
||||||
|
proxies={"http": proxy, "https": proxy} if proxy else None,
|
||||||
|
verify=not proxy, # Bright Data MITM CA — curl_cffi nie ma go w bundlu
|
||||||
)
|
)
|
||||||
return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))
|
return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue