From 956a0feb22d25f0f2c6cd31c80ff67ae58a36374 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 10 Jun 2026 19:18:40 +0200 Subject: [PATCH] docs: correct Bright Data proxy type (ISP, flat-rate not per-GB) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is an ISP proxy (static ISP IPs, flat billing), not residential — so HTML-ingest bandwidth is free and the full deep-crawl is fine. Co-Authored-By: Claude Fable 5 --- app/config.py | 6 +++--- app/connectors/direct_scrapers/__init__.py | 2 +- app/connectors/direct_scrapers/_browse_base.py | 2 +- app/connectors/direct_scrapers/superporn.py | 6 +++--- app/extractors/_fetch.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/app/config.py b/app/config.py index 1022bd5..dc60d4b 100644 --- a/app/config.py +++ b/app/config.py @@ -121,9 +121,9 @@ class Settings(BaseSettings): hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT") hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT") - # Bright Data residential proxy — używany TYLKO do ingestu HTML (scrape) tubów - # które blokują VPS IP twardym Cloudflare 403 nawet z browser-TLS (superporn). - # NIE do streamowania wideo (transfer leciałby przez płatne proxy + tokeny i tak + # Bright Data ISP proxy (stałe IP od ISP, rozliczane ryczałtem NIE per-GB) — + # używany do ingestu HTML (scrape) tubów które blokują VPS IP twardym Cloudflare + # 403 nawet z browser-TLS (superporn). Streamu i tak nie ruszamy proxy (tokeny CDN # IP-bound). Format env: `host:port:user:pass` (panel Bright Data). Pusty = brak. brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL") diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 71f1f59..a68a2bc 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -227,7 +227,7 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ NepornScraper, # SuperpornScraper — dołączony 2026-06-10 (user request). superporn blokuje VPS IP # twardym CF 403 (każda impersonacja TLS), więc ingest HTML idzie przez Bright Data - # residential proxy (BRIGHTDATA_PROXY_URL). Pierwszy scraper z proxy — `_proxy` w + # ISP proxy (BRIGHTDATA_PROXY_URL, ryczałt nie per-GB). Pierwszy scraper z proxy — `_proxy` w # _browse_base. JSON-LD (title+desc+uploadDate+thumb+duration) + chipy pornstar/ # kategorie. Playback IP-bound → WebView (extractor superporncom → _vps_blocked_fallback). # Bez proxy: scraper no-op (pusty iterator). diff --git a/app/connectors/direct_scrapers/_browse_base.py b/app/connectors/direct_scrapers/_browse_base.py index 6f3b50b..fe1fcf6 100644 --- a/app/connectors/direct_scrapers/_browse_base.py +++ b/app/connectors/direct_scrapers/_browse_base.py @@ -53,7 +53,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC): _proxy: str | None = None """Opcjonalny proxy (http://user:pass@host:port) dla listing+detail fetchy. Ustawiany przez scrapery tubów blokujących VPS IP (superporn → Bright Data - residential). None = bezpośredni fetch (domyślnie).""" + ISP proxy). None = bezpośredni fetch (domyślnie).""" @abc.abstractmethod def _listing_url(self, page: int) -> str: diff --git a/app/connectors/direct_scrapers/superporn.py b/app/connectors/direct_scrapers/superporn.py index 7e5fff4..2fa233e 100644 --- a/app/connectors/direct_scrapers/superporn.py +++ b/app/connectors/direct_scrapers/superporn.py @@ -1,8 +1,8 @@ -"""superporn.com — latest-vids browse scraper (przez Bright Data residential proxy). +"""superporn.com — latest-vids browse scraper (przez Bright Data ISP proxy). Dołączony 2026-06-10 (user request). superporn twardo blokuje VPS IP Cloudflarem (403 na KAŻDEJ impersonacji TLS — chrome/safari/firefox), więc ingest HTML idzie -przez Bright Data residential proxy (`settings.brightdata_proxy_url`). Gdy proxy +przez Bright Data ISP proxy (`settings.brightdata_proxy_url`). Gdy proxy nieskonfigurowane → scraper no-op (pusty iterator, log warning). Proxy używamy TYLKO do scrape HTML. Playback NIE idzie przez proxy: `` mp4 @@ -95,7 +95,7 @@ class SuperpornScraper(BaseBrowseScraper): def __init__(self) -> None: super().__init__() - # Bright Data residential — bez niego superporn jest nieosiągalny z VPS. + # Bright Data ISP proxy — bez niego superporn jest nieosiągalny z VPS. self._proxy = get_settings().brightdata_proxy_url if not self._proxy: log.warning("superporn: BRIGHTDATA_PROXY_URL unset — scraper disabled") diff --git a/app/extractors/_fetch.py b/app/extractors/_fetch.py index b102390..089ffd0 100644 --- a/app/extractors/_fetch.py +++ b/app/extractors/_fetch.py @@ -62,7 +62,7 @@ def browser_get( `proxy` (http://user:pass@host:port) — routuje request przez proxy. Używane tylko do ingestu HTML tubów blokujących VPS IP (np. superporn przez Bright Data - residential). NIE dla streamów.""" + ISP proxy). NIE dla streamów.""" if not _HAS_CURL_CFFI: proxies = {"http://": proxy, "https://": proxy} if proxy else None with httpx.Client(