diff --git a/app/config.py b/app/config.py index 3bcea64..1022bd5 100644 --- a/app/config.py +++ b/app/config.py @@ -121,6 +121,22 @@ class Settings(BaseSettings): hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT") hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT") + # Bright Data residential proxy — używany TYLKO do ingestu HTML (scrape) tubów + # które blokują VPS IP twardym Cloudflare 403 nawet z browser-TLS (superporn). + # NIE do streamowania wideo (transfer leciałby przez płatne proxy + tokeny i tak + # IP-bound). Format env: `host:port:user:pass` (panel Bright Data). Pusty = brak. + brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL") + + @property + def brightdata_proxy_url(self) -> str | None: + """`host:port:user:pass` → `http://user:pass@host:port` dla curl_cffi/httpx. + None gdy nieustawiony lub w złym formacie.""" + parts = self.brightdata_proxy_raw.split(":") + if len(parts) != 4 or not all(parts): + return None + host, port, user, pwd = parts + return f"http://{user}:{pwd}@{host}:{port}" + @property def api_keys(self) -> set[str]: return {k.strip() for k in self.api_keys_raw.split(",") if k.strip()} diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 6968a98..71f1f59 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -156,6 +156,7 @@ from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E40 from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402 from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402 from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402 +from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402 from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402 from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402 @@ -224,6 +225,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # "- HardX Update - ..." — fuzzy match po tytule). Resolve server-side _kvs, # finalny remote_control.php portable cross-IP. NepornScraper, + # SuperpornScraper — dołączony 2026-06-10 (user request). superporn blokuje VPS IP + # twardym CF 403 (każda impersonacja TLS), więc ingest HTML idzie przez Bright Data + # residential proxy (BRIGHTDATA_PROXY_URL). Pierwszy scraper z proxy — `_proxy` w + # _browse_base. JSON-LD (title+desc+uploadDate+thumb+duration) + chipy pornstar/ + # kategorie. Playback IP-bound → WebView (extractor superporncom → _vps_blocked_fallback). + # Bez proxy: scraper no-op (pusty iterator). + SuperpornScraper, # porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03). # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). ] diff --git a/app/connectors/direct_scrapers/_browse_base.py b/app/connectors/direct_scrapers/_browse_base.py index c94a646..6f3b50b 100644 --- a/app/connectors/direct_scrapers/_browse_base.py +++ b/app/connectors/direct_scrapers/_browse_base.py @@ -50,6 +50,11 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC): _timeout: float = 30.0 """HTTP timeout per request.""" + _proxy: str | None = None + """Opcjonalny proxy (http://user:pass@host:port) dla listing+detail fetchy. + Ustawiany przez scrapery tubów blokujących VPS IP (superporn → Bright Data + residential). None = bezpośredni fetch (domyślnie).""" + @abc.abstractmethod def _listing_url(self, page: int) -> str: """URL listing page'a 'latest-vids' (page 1 = newest).""" @@ -78,7 +83,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC): """ url = self._listing_url(page) try: - res = browser_get(url, timeout=self._timeout) + res = browser_get(url, timeout=self._timeout, proxy=self._proxy) html = res.text if hasattr(res, "text") else res except Exception as e: log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e) @@ -92,7 +97,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC): out: list[RawScene] = [] for scene_url in urls: try: - res = browser_get(scene_url, timeout=self._timeout) + res = browser_get(scene_url, timeout=self._timeout, proxy=self._proxy) detail_html = res.text if hasattr(res, "text") else res except Exception as e: log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e) diff --git a/app/connectors/direct_scrapers/superporn.py b/app/connectors/direct_scrapers/superporn.py new file mode 100644 index 0000000..7e5fff4 --- /dev/null +++ b/app/connectors/direct_scrapers/superporn.py @@ -0,0 +1,194 @@ +"""superporn.com — latest-vids browse scraper (przez Bright Data residential proxy). + +Dołączony 2026-06-10 (user request). superporn twardo blokuje VPS IP Cloudflarem +(403 na KAŻDEJ impersonacji TLS — chrome/safari/firefox), więc ingest HTML idzie +przez Bright Data residential proxy (`settings.brightdata_proxy_url`). Gdy proxy +nieskonfigurowane → scraper no-op (pusty iterator, log warning). + +Proxy używamy TYLKO do scrape HTML. Playback NIE idzie przez proxy: `` mp4 +(cdnst*.superporn.com) ma token IP-bound do fetchera (403 cross-IP), więc resolve +musi nastąpić po stronie telefonu — extractor `superporncom` → `_vps_blocked_fallback` +(mobile WebView ładuje stronę z residential IP telefonu, INJECTED_JS bierze video.src). +Thumbnaile (img*.superporn.com) schodzą z VPS bez proxy (image proxy działa). + +Sygnały (SSR HTML): + - JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl, duration + ISO 8601 (`P0DT0H38M48S`). `author` = uploader (NIE performer — pomijamy). + - Performerzy: chip `Name` + - Kategorie: chip `Name` (bez + `/pornstar/`) — w bloku `#collapse-categories`. + - Duration backup: `