goon/app/extractors/_fetch.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

120 lines
4.3 KiB
Python

"""Browser-impersonation HTTP fetcher dla tube'ów blokujących Pythonowy TLS fingerprint.
Niektóre Cloudflare-fronted tube'y (np. perverzija) blokują httpx na podstawie JA3
TLS hash (charakterystycznego dla Pythonowego stacka), zwracając 403 nawet z dobrym
UA + Referer. `curl_cffi` używa libcurl + skompilowanej wersji TLS lib z prawdziwego
Chrome'a, dzięki czemu ja3 hash jest identyczny jak browser → CF wpuszcza.
Fallback na httpx tylko gdy curl_cffi nie zainstalowany (zachowujemy backwards-compat
w razie problemów z buildem libcurl-impersonate).
"""
from __future__ import annotations
import logging
from collections.abc import Mapping
from dataclasses import dataclass
from urllib.parse import urlparse
import httpx
from app.extractors._models import TubePageError
log = logging.getLogger(__name__)

# curl_cffi is an optional dependency. Record whether it could be imported so
# that browser_get can choose between the curl_cffi and httpx backends.
try:
    from curl_cffi import requests as _cf_requests  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover
    _HAS_CURL_CFFI = False
    log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")
else:
    _HAS_CURL_CFFI = True
# Default value of browser_get's `impersonate` parameter, which is passed on to
# curl_cffi to select the browser fingerprint profile.
# NOTE(review): this names Chrome 120 while the User-Agent below advertises
# Chrome 140 — confirm the version mismatch is intentional.
_DEFAULT_IMPERSONATE = "chrome120"

# User-Agent header value sent by fetch_tube_html.
_DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/140.0.0.0 Safari/537.36"
)
@dataclass
class FetchResult:
    """Minimal response-like object — a drop-in for httpx.Response in our use cases."""

    # HTTP status code of the response.
    status_code: int
    # Response body as text.
    text: str
    # URL of the response as reported by the HTTP client, stored as a string.
    url: str

    def raise_for_status(self) -> None:
        """Raise TubePageError for a 4xx/5xx status code; otherwise do nothing.

        Raises:
            TubePageError: When ``400 <= status_code < 600``; constructed with
                the status code and the URL.
        """
        outside_error_range = self.status_code < 400 or self.status_code >= 600
        if outside_error_range:
            return
        raise TubePageError(self.status_code, self.url)
def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
) -> FetchResult:
    """Send a GET with a Chrome TLS fingerprint (curl_cffi); use httpx when curl_cffi is missing.

    Args:
        url: Address to request.
        headers: Optional request headers; copied into a fresh dict before use.
        timeout: Timeout value handed to the HTTP client.
        follow_redirects: Whether the HTTP client should follow redirects.
        impersonate: Browser profile name passed to curl_cffi. Ignored on the
            httpx fallback path.

    Returns:
        A FetchResult holding the status code, body text and response URL.

    Exceptions raised by the underlying HTTP client are not caught here.
    """
    request_headers = dict(headers or {})

    if _HAS_CURL_CFFI:
        reply = _cf_requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            impersonate=impersonate,
            allow_redirects=follow_redirects,
        )
        return FetchResult(status_code=reply.status_code, text=reply.text, url=str(reply.url))

    # Fallback: plain httpx, without browser fingerprint impersonation.
    with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as client:
        reply = client.get(url, headers=request_headers)
        return FetchResult(status_code=reply.status_code, text=reply.text, url=str(reply.url))
def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome User-Agent, retrying transient failures.

    Standalone replacement for `PornAppClient.fetch_tube_html`. Goes through
    `browser_get` (curl_cffi) to get past JA3 fingerprint blocks on CF-fronted
    tubes.

    Retry policy: an exception from `browser_get`, a 5xx response, or a 200
    response whose body is shorter than 500 characters is retried up to
    `max_retries` times with exponential backoff (0.5s, 1s, 2s, ...). Sites
    such as freshporno sometimes answer 503/empty; without a retry the user
    got "extractor None" from a transient hiccup.

    Args:
        url: Page URL to fetch.
        timeout: Timeout forwarded to `browser_get`.
        max_retries: Number of extra attempts after the first one.

    Returns:
        The response body text. A short 200 body is returned as-is once the
        retries are used up.

    Raises:
        TubePageError: When the final response has a status >= 400, or with
            status 0 when no attempt was made (negative `max_retries`).
        Exception: Whatever `browser_get` raised on the last attempt.
    """
    import time as _time

    host = urlparse(url).hostname or ""
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        # NOTE(review): non-browser header carrying the target hostname;
        # presumably inherited from PornAppClient — confirm it is still needed.
        "x-site": host,
    }

    total_attempts = max_retries + 1
    for attempt in range(total_attempts):
        can_retry = attempt < max_retries
        try:
            resp = browser_get(url, headers=headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            log.info("fetch_tube_html attempt %d/%d for %s: %s", attempt + 1, total_attempts, url, e)
            if not can_retry:
                raise
        else:
            # Retry on 5xx (transient server error) or a near-empty body (CDN cache miss).
            transient = 500 <= resp.status_code < 600 or (
                resp.status_code == 200 and len(resp.text) < 500
            )
            if not (transient and can_retry):
                if resp.status_code >= 400:
                    raise TubePageError(resp.status_code, url)
                return resp.text
            log.info("fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                     url, attempt + 1, total_attempts, resp.status_code, len(resp.text))
        # Reached only when this attempt failed transiently and another attempt
        # is allowed: wait 0.5s, 1s, 2s, ... before trying again.
        _time.sleep(0.5 * 2 ** attempt)

    # Only reachable when the loop never ran (max_retries < 0).
    raise TubePageError(0, url)
# Names exported by `from <module> import *`. `_DEFAULT_UA` is listed here
# explicitly, so it is exported despite its leading underscore.
__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]