"""Browser-impersonating HTTP fetcher for tube sites that block Python's TLS fingerprint.

Some Cloudflare-fronted tubes (e.g. perverzija) block httpx based on its JA3
TLS hash (characteristic of the Python stack) and return 403 even with a valid
User-Agent and Referer. `curl_cffi` uses libcurl together with a TLS library
built to match real Chrome, so the JA3 hash is identical to a browser's and
Cloudflare lets the request through.

The httpx fallback is used only when curl_cffi is not installed (kept for
backwards compatibility in case of libcurl-impersonate build problems).
"""

from __future__ import annotations

import logging
import time
from collections.abc import Mapping
from dataclasses import dataclass
from urllib.parse import urlparse

import httpx

from app.extractors._models import TubePageError

log = logging.getLogger(__name__)

try:
    from curl_cffi import requests as _cf_requests  # type: ignore[import-not-found]

    _HAS_CURL_CFFI = True
except ImportError:  # pragma: no cover
    _HAS_CURL_CFFI = False
    log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")

# NOTE(review): the TLS impersonation target is chrome120 while the User-Agent
# below advertises Chrome/140 — confirm this version mismatch is intentional.
_DEFAULT_IMPERSONATE = "chrome120"
_DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)

# First retry delay in seconds; doubled for every subsequent retry.
_BACKOFF_BASE_SECONDS = 0.5
# A 200 response with a body shorter than this is treated as an empty page.
_MIN_HTML_LENGTH = 500


@dataclass
class FetchResult:
    """Minimal response-like object — drop-in for httpx.Response in our use cases."""

    status_code: int
    text: str
    url: str

    def raise_for_status(self) -> None:
        """Raise TubePageError(status_code, url) for any 4xx/5xx status."""
        if 400 <= self.status_code < 600:
            raise TubePageError(self.status_code, self.url)


def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
    proxy: str | None = None,
) -> FetchResult:
    """GET with a Chrome TLS fingerprint (curl_cffi); falls back to httpx if curl_cffi is missing.

    Args:
        url: Address to fetch.
        headers: Optional request headers; copied into a fresh dict before use.
        timeout: Timeout in seconds, passed to the underlying client.
        follow_redirects: Whether redirects are followed.
        impersonate: curl_cffi impersonation target; ignored by the httpx fallback.
        proxy: Proxy URL (http://user:pass@host:port) to route the request
            through. Used only for HTML ingest of tubes that block the VPS IP
            (e.g. superporn via a Bright Data ISP proxy). NOT for streams.

    Returns:
        FetchResult with the status code, body text and final URL of the response.
    """
    request_headers = dict(headers or {})

    if not _HAS_CURL_CFFI:
        # NOTE(review): newer httpx releases replaced `proxies=` with
        # `proxy=`/`mounts=` — confirm against the pinned httpx version.
        # NOTE(review): unlike the curl_cffi path below, TLS verification stays
        # enabled here even when a proxy is set — confirm this is intended.
        proxies = {"http://": proxy, "https://": proxy} if proxy else None
        with httpx.Client(
            timeout=timeout, follow_redirects=follow_redirects, proxies=proxies
        ) as http:
            r = http.get(url, headers=request_headers)
            return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))

    r = _cf_requests.get(
        url,
        headers=request_headers,
        timeout=timeout,
        impersonate=impersonate,
        allow_redirects=follow_redirects,
        proxies={"http": proxy, "https": proxy} if proxy else None,
        verify=not proxy,  # Bright Data MITM CA — curl_cffi does not ship it in its bundle
    )
    return FetchResult(status_code=r.status_code, text=r.text, url=str(r.url))


def _retry_delay(retry_index: int) -> float:
    """Return the backoff delay in seconds before retry *retry_index* (0-based): 0.5, 1, 2, ..."""
    return _BACKOFF_BASE_SECONDS * 2**retry_index


def _is_transient(resp: FetchResult) -> bool:
    """Return True for responses worth retrying: any 5xx, or a 200 with a near-empty body."""
    if 500 <= resp.status_code < 600:
        return True
    return resp.status_code == 200 and len(resp.text) < _MIN_HTML_LENGTH


def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome UA, retrying transient failures.

    Standalone replacement for `PornAppClient.fetch_tube_html`. Uses curl_cffi
    (via browser_get) to get past JA3 fingerprint blocks on CF-fronted tubes.

    Retry policy: exceptions from browser_get, 5xx responses and near-empty 200
    bodies are retried up to `max_retries` times with exponential backoff
    (0.5s, 1s, 2s, ...). Sites such as freshporno sometimes return 503 or an
    empty body; without retries the user got "extractor None" from a transient
    hiccup.

    Args:
        url: Page address to fetch.
        timeout: Per-request timeout in seconds.
        max_retries: Number of retries after the first attempt. Negative values
            are treated as 0, so at least one request is always made.

    Returns:
        The response body text. A near-empty 200 body is returned as-is once
        the retries are exhausted.

    Raises:
        TubePageError: The final response has a status code >= 400.
        Exception: Whatever browser_get raised on the final attempt.
    """
    host = urlparse(url).hostname or ""
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "x-site": host,
    }

    # Clamp so that a negative max_retries still performs a single attempt
    # instead of skipping the request entirely.
    total_attempts = max(max_retries, 0) + 1

    for attempt in range(total_attempts):
        retries_left = attempt < total_attempts - 1
        try:
            resp = browser_get(url, headers=headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            # Retry boundary: any transport-level failure is logged, then either
            # retried or re-raised unchanged on the final attempt.
            log.info(
                "fetch_tube_html attempt %d/%d for %s: %s",
                attempt + 1, total_attempts, url, e,
            )
            if not retries_left:
                raise
            time.sleep(_retry_delay(attempt))
            continue

        # Retry on 5xx (transient server error) or an empty body (CDN cache miss).
        if retries_left and _is_transient(resp):
            log.info(
                "fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                url, attempt + 1, total_attempts, resp.status_code, len(resp.text),
            )
            time.sleep(_retry_delay(attempt))
            continue

        if resp.status_code >= 400:
            raise TubePageError(resp.status_code, url)
        return resp.text

    # Defensive only: the final loop iteration always returns or raises.
    raise TubePageError(0, url)


__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]