goon/app/connectors/direct_scrapers/superporn.py
jtrzupek 21bc8bf1fe feat(superporn): browse scraper via Bright Data residential proxy
superporn hard-blocks the VPS IP with Cloudflare 403 on every TLS
impersonation, so HTML ingest routes through Bright Data residential
(BRIGHTDATA_PROXY_URL, parsed in config). First scraper to use a proxy:
optional _proxy on the browse base, threaded into browser_get.

JSON-LD VideoObject (title/desc/uploadDate/thumb/duration) + pornstar
and category chips; superporn double-encodes HTML entities so titles
are unescaped twice. Thumbnails fetch fine from the VPS (no proxy).

Playback stays off-proxy: the <source> mp4 token is IP-bound to the
fetcher, so resolve is phone-side via WebView (extractor superporncom
-> _vps_blocked_fallback), same as porndoe.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 18:47:45 +02:00

194 lines
7.1 KiB
Python

"""superporn.com — latest-vids browse scraper (przez Bright Data residential proxy).
Dołączony 2026-06-10 (user request). superporn twardo blokuje VPS IP Cloudflarem
(403 na KAŻDEJ impersonacji TLS — chrome/safari/firefox), więc ingest HTML idzie
przez Bright Data residential proxy (`settings.brightdata_proxy_url`). Gdy proxy
nieskonfigurowane → scraper no-op (pusty iterator, log warning).
Proxy używamy TYLKO do scrape HTML. Playback NIE idzie przez proxy: `<source>` mp4
(cdnst*.superporn.com) ma token IP-bound do fetchera (403 cross-IP), więc resolve
musi nastąpić po stronie telefonu — extractor `superporncom` → `_vps_blocked_fallback`
(mobile WebView ładuje stronę z residential IP telefonu, INJECTED_JS bierze video.src).
Thumbnaile (img*.superporn.com) schodzą z VPS bez proxy (image proxy działa).
Sygnały (SSR HTML):
- JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl, duration
ISO 8601 (`P0DT0H38M48S`). `author` = uploader (NIE performer — pomijamy).
- Performerzy: chip `<a class="chip-link" href=".../pornstar/<slug>"><span>Name</span>`
- Kategorie: chip `<a class="chip-link" href="/<slug>"><span>Name</span>` (bez
`/pornstar/`) — w bloku `#collapse-categories`.
- Duration backup: `<video ... data-video-duration="2328">` (sekundy).
Listing: `?page=N` (newest-first; sitemap lastmod jest z 2024, bezużyteczny).
"""
from __future__ import annotations
import html
import logging
import re
from app.config import get_settings
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.connectors.direct_scrapers._playtube import (
_extract_video_object,
_parse_iso_date,
)
from app.normalize.text import slugify
log = logging.getLogger(__name__)
# Site root; listing URLs and the thumbnail referer are derived from it.
_BASE = "https://www.superporn.com"

# Absolute scene links as they appear in listing HTML.
_SCENE_URL_RE = re.compile(
    r'href="(https://www\.superporn\.com/video/[a-z0-9\-]+)"',
    flags=re.I,
)

# Performer chip: href contains /pornstar/<slug> (absolute or relative);
# the display name sits in the nested <span>.
_PERF_CHIP_RE = re.compile(
    r'<a[^>]+class="chip-link"[^>]+href="[^"]*/pornstar/[^"]*"[^>]*>.*?<span>([^<]+)</span>',
    flags=re.I | re.S,
)

# Category chip: href="/<slug>" — relative only, and neither /pornstar/ nor
# /video/. Group 1 is the slug, group 2 the display name.
_CAT_CHIP_RE = re.compile(
    r'<a[^>]+class="chip-link"[^>]+href="/(?!pornstar/|video/)([a-z0-9\-]+)"[^>]*>.*?<span>([^<]+)</span>',
    flags=re.I | re.S,
)

# Duration in seconds carried as an attribute on the <video> element.
_DATA_DUR_RE = re.compile(r'data-video-duration="(\d+)"')

# ISO 8601 duration including a day component, e.g. `P0DT0H38M48S`.
# Groups: days, hours, minutes, seconds (each optional).
_ISO_DUR_FULL_RE = re.compile(
    r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?",
    flags=re.I,
)
def _clean(text: str) -> str:
    """Decode HTML entities twice, then trim surrounding whitespace.

    superporn double-encodes entities in its JSON-LD (`&amp;#39;` where a
    plain apostrophe is meant), so a single pass is not enough. The decode
    depth is capped at two passes; text that needs fewer is unaffected by
    the extra pass because decoding already-plain text is a no-op.
    """
    decoded_once = html.unescape(text)
    decoded_twice = html.unescape(decoded_once)
    return decoded_twice.strip()
def _parse_iso_duration_days(value: str | None) -> int | None:
    """Convert an ISO 8601 duration with optional days into whole seconds.

    Example: `P0DT0H38M48S` -> 2328.

    Returns None for empty input, for text that `_ISO_DUR_FULL_RE` does not
    match at its start, and for durations that add up to zero seconds.
    """
    if not value:
        return None

    match = _ISO_DUR_FULL_RE.match(value.strip())
    if match is None:
        return None

    # Components that are absent from the string come back as None -> 0.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    total_seconds = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    return total_seconds if total_seconds else None
class SuperpornScraper(BaseBrowseScraper):
    """Browse scraper for superporn.com's newest-first listing.

    HTML is fetched through the Bright Data residential proxy read from
    `settings.brightdata_proxy_url`; when that setting is empty the scraper
    reports every page as empty instead of hitting the site.
    """

    sitetag = "superporncom"

    def __init__(self) -> None:
        """Read the proxy URL from settings and warn if it is missing."""
        super().__init__()
        # Bright Data residential proxy — without it superporn is unreachable
        # from the VPS. Presumably consumed by the base class when fetching.
        self._proxy = get_settings().brightdata_proxy_url
        if not self._proxy:
            log.warning("superporn: BRIGHTDATA_PROXY_URL unset — scraper disabled")

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for `page`; page 1 (or lower) is the site root."""
        return f"{_BASE}/" if page <= 1 else f"{_BASE}/?page={page}"

    def crawl_page(self, page: int):
        """Crawl one listing page, or return an empty list when no proxy is set.

        Without the proxy a request is a guaranteed Cloudflare 403, so an
        empty result is returned to signal "exhausted" and keep the deep
        crawl from retrying in a loop.
        """
        if self._proxy:
            return super().crawl_page(page)
        return []

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in `listing_html`, in page order."""
        found = (hit.group(1) for hit in _SCENE_URL_RE.finditer(listing_html))
        # dict keys keep first-seen order, giving an order-preserving de-dup.
        return list(dict.fromkeys(found))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a scene detail page.

        Title, description, upload date, thumbnail and duration come from the
        JSON-LD VideoObject; performers and categories come from the chip
        links in the HTML. Returns None when the VideoObject yields no title.
        """
        video = _extract_video_object(detail_html) or {}

        title = _clean(video.get("name") or "")
        if not title:
            return None

        # Prefer the URL slug as the stable id; fall back to the full URL.
        slug_match = re.search(r"/video/([a-z0-9\-]+)", scene_url, re.IGNORECASE)
        scene_key = slug_match.group(1) if slug_match else scene_url

        # JSON-LD duration first, then the <video data-video-duration> backup.
        duration_sec = _parse_iso_duration_days(video.get("duration"))
        if duration_sec is None:
            attr_match = _DATA_DUR_RE.search(detail_html)
            duration_sec = (int(attr_match.group(1)) or None) if attr_match else None

        release_date = _parse_iso_date(video.get("uploadDate"))
        description = _clean(video.get("description") or "") or None
        raw_thumbnail = video.get("thumbnailUrl") or ""
        thumbnail_url = raw_thumbnail.strip() or None

        # Performers, de-duplicated by slugified name (first occurrence wins).
        performers_by_slug: dict[str, RawPerformer] = {}
        for chip in _PERF_CHIP_RE.finditer(detail_html):
            display_name = _clean(chip.group(1))
            perf_slug = slugify(display_name)
            if perf_slug and perf_slug not in performers_by_slug:
                performers_by_slug[perf_slug] = RawPerformer(
                    external_id=f"{self.sitetag}:performer:{perf_slug}",
                    name=display_name,
                )

        # Categories, de-duplicated by the slug taken from the chip href.
        tags_by_slug: dict[str, RawTag] = {}
        for chip in _CAT_CHIP_RE.finditer(detail_html):
            tag_slug = chip.group(1).strip()
            tag_name = _clean(chip.group(2))
            if tag_name and tag_slug not in tags_by_slug:
                tags_by_slug[tag_slug] = RawTag(
                    external_id=f"{self.sitetag}:tag:{tag_slug}",
                    name=tag_name,
                    slug=tag_slug,
                )

        # Perceptual hash of the thumbnail — fetched without the proxy, since
        # the image hosts are reachable from the VPS.
        phash = (
            compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if thumbnail_url
            else None
        )
        fingerprints: list[RawFingerprint] = (
            [RawFingerprint(kind="phash", value=phash)] if phash else []
        )

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=duration_sec,
            thumbnail_url=thumbnail_url,
        )

        return RawScene(
            external_id=f"{self.sitetag}:{scene_key}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=list(performers_by_slug.values()),
            tags=list(tags_by_slug.values()),
            fingerprints=fingerprints,
            playback_sources=[playback],
        )