# goon/app/extractors/_fetch.py

"""Browser-impersonation HTTP fetcher dla tube'ów blokujących Pythonowy TLS fingerprint.

Niektóre Cloudflare-fronted tube'y (np. perverzija) blokują httpx na podstawie JA3
TLS hash (charakterystycznego dla Pythonowego stacka), zwracając 403 nawet z dobrym
UA + Referer. `curl_cffi` używa libcurl + skompilowanej wersji TLS lib z prawdziwego
Chrome'a, dzięki czemu ja3 hash jest identyczny jak browser → CF wpuszcza.

Fallback na httpx tylko gdy curl_cffi nie zainstalowany (zachowujemy backwards-compat
w razie problemów z buildem libcurl-impersonate).
"""
from __future__ import annotations

import logging
from collections.abc import Mapping
from dataclasses import dataclass
from urllib.parse import urlparse

import httpx

from app.extractors._models import TubePageError

log = logging.getLogger(__name__)

# curl_cffi is an optional dependency: remember whether it imported so that
# browser_get() can pick between it and the plain httpx fallback.
try:
    from curl_cffi import requests as _cf_requests  # type: ignore[import-not-found]
    _HAS_CURL_CFFI = True
except ImportError:  # pragma: no cover
    _HAS_CURL_CFFI = False
    # Emitted once, at import time, so the degraded mode is visible in the logs.
    log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")


# Default value of browser_get()'s `impersonate` argument, which is handed to
# curl_cffi unchanged.
_DEFAULT_IMPERSONATE = "chrome120"
# User-Agent header that fetch_tube_html() sends with every request.
# NOTE(review): this UA advertises Chrome/140 while the impersonation profile
# above is "chrome120" — confirm the version mismatch is intentional.
_DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)


@dataclass
class FetchResult:
    """Mini response-like object — a drop-in for httpx.Response in our use cases.

    Holds only the status code, the body text and the response URL.
    """

    status_code: int
    text: str
    url: str

    def raise_for_status(self) -> None:
        """Raise ``TubePageError`` for a 4xx/5xx status code; otherwise do nothing."""
        code = self.status_code
        # Anything outside [400, 600) is not treated as an error.
        if code < 400 or code >= 600:
            return
        raise TubePageError(code, self.url)


def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
) -> FetchResult:
    """GET ``url`` with a Chrome TLS fingerprint (curl_cffi); use httpx when curl_cffi is missing.

    Args:
        url: Address to request.
        headers: Optional request headers; copied into a fresh dict before use.
        timeout: Timeout value forwarded to the underlying HTTP library.
        follow_redirects: Whether redirects are followed.
        impersonate: Browser profile handed to curl_cffi; unused on the httpx path.

    Returns:
        A ``FetchResult`` with the response's status code, text and URL.
    """
    request_headers = dict(headers or {})

    if _HAS_CURL_CFFI:
        response = _cf_requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            impersonate=impersonate,
            allow_redirects=follow_redirects,
        )
        return FetchResult(
            status_code=response.status_code,
            text=response.text,
            url=str(response.url),
        )

    # curl_cffi is unavailable: issue a plain httpx request instead.
    with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as client:
        response = client.get(url, headers=request_headers)
        return FetchResult(
            status_code=response.status_code,
            text=response.text,
            url=str(response.url),
        )


def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome User-Agent, retrying transient failures.

    Standalone replacement for `PornAppClient.fetch_tube_html`. Requests go through
    `browser_get` (curl_cffi) to get past JA3-fingerprint blocks on CF-fronted tubes.

    An attempt is retried when `browser_get` raises, when the status is 5xx, or when
    a 200 response carries fewer than 500 characters of text. The pause before the
    next attempt grows linearly: 0.5s times the attempt number (0.5s, then 1s with the
    default `max_retries=2`). Sites such as freshporno sometimes answer 503/empty, and
    without a retry the user got "extractor None" from a transient hiccup.

    Args:
        url: Page to fetch.
        timeout: Timeout forwarded to `browser_get`.
        max_retries: Extra attempts after the first one. Negative values are
            treated as 0, so at least one request is always made.

    Returns:
        The response text. A short 200 body that persists through every attempt
        is returned as-is.

    Raises:
        TubePageError: The final response has a status code >= 400.
        Exception: Whatever `browser_get` raised on the final attempt.
    """
    import time as _time

    host = urlparse(url).hostname or ""
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        # NOTE(review): `x-site` is not a header a browser sends — presumably a
        # leftover from the PornAppClient path; confirm it is still needed.
        "x-site": host,
    }

    # Clamp so a negative max_retries still performs one request instead of
    # skipping the loop and failing with a made-up status of 0.
    total_attempts = max(0, max_retries) + 1

    for attempt in range(total_attempts):
        attempt_no = attempt + 1
        is_last = attempt_no == total_attempts
        try:
            resp = browser_get(url, headers=headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            # Broad on purpose: browser_get may run on curl_cffi or on httpx,
            # which raise unrelated exception types.
            log.info("fetch_tube_html attempt %d/%d for %s: %s", attempt_no, total_attempts, url, e)
            if is_last:
                raise
            _time.sleep(0.5 * attempt_no)
            continue

        body_len = len(resp.text)
        # Retry on 5xx (transient server error) or a near-empty 200 body (CDN cache miss).
        transient = 500 <= resp.status_code < 600 or (resp.status_code == 200 and body_len < 500)
        if transient and not is_last:
            log.info("fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                     url, attempt_no, total_attempts, resp.status_code, body_len)
            _time.sleep(0.5 * attempt_no)
            continue

        if resp.status_code >= 400:
            raise TubePageError(resp.status_code, url)
        return resp.text

    # Defensive only: every iteration above returns, raises or continues, and the
    # loop runs at least once, so control should never get here.
    raise TubePageError(0, url)


# Names exported by `from ... import *`. `_DEFAULT_UA` is listed despite its
# underscore prefix — presumably so other modules can reuse the same UA; verify
# against callers.
__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]