Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
"""Browser-impersonation HTTP fetcher dla tube'ów blokujących Pythonowy TLS fingerprint.
|
|
|
|
Niektóre Cloudflare-fronted tube'y (np. perverzija) blokują httpx na podstawie JA3
|
|
TLS hash (charakterystycznego dla Pythonowego stacka), zwracając 403 nawet z dobrym
|
|
UA + Referer. `curl_cffi` używa libcurl + skompilowanej wersji TLS lib z prawdziwego
|
|
Chrome'a, dzięki czemu ja3 hash jest identyczny jak browser → CF wpuszcza.
|
|
|
|
Fallback na httpx tylko gdy curl_cffi nie zainstalowany (zachowujemy backwards-compat
|
|
w razie problemów z buildem libcurl-impersonate).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from collections.abc import Mapping
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from app.extractors._models import TubePageError
|
|
|
|
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

# curl_cffi is an optional dependency: record whether it imported so that
# browser_get() can choose a transport at call time instead of this module
# failing at import time.
try:
    from curl_cffi import requests as _cf_requests  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover
    _HAS_CURL_CFFI = False
    log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")
else:
    _HAS_CURL_CFFI = True
|
|
|
|
|
|
# Browser profile name passed to curl_cffi as `impersonate=` by browser_get().
_DEFAULT_IMPERSONATE = "chrome120"
# User-Agent header value sent by fetch_tube_html().
# NOTE(review): this string advertises Chrome 140 while the impersonation
# profile above is "chrome120" — confirm the version mismatch is intentional.
_DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
@dataclass
class FetchResult:
    """Minimal response-like container returned by ``browser_get``.

    Holds only the status code, body text and URL, plus a
    ``raise_for_status`` method, so it can stand in for ``httpx.Response``
    in the places this project uses just those parts.
    """

    status_code: int  # HTTP status code of the response
    text: str  # response body as text
    url: str  # URL reported by the underlying response object, as a string

    def raise_for_status(self) -> None:
        """Raise ``TubePageError(status_code, url)`` for any 4xx/5xx status.

        Returns ``None`` without side effects for every other status code.
        """
        # Guard clause: anything outside [400, 600) is not treated as an error.
        if self.status_code < 400 or self.status_code >= 600:
            return
        raise TubePageError(self.status_code, self.url)
|
|
|
|
|
|
def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
) -> FetchResult:
    """Issue a GET with a Chrome TLS fingerprint (curl_cffi), else via httpx.

    Args:
        url: Address to request.
        headers: Optional request headers; copied into a fresh dict before use.
        timeout: Timeout value forwarded to the underlying HTTP library.
        follow_redirects: Forwarded as ``follow_redirects`` (httpx) or
            ``allow_redirects`` (curl_cffi).
        impersonate: Browser profile name forwarded to curl_cffi; unused on
            the httpx fallback path.

    Returns:
        A ``FetchResult`` with the response's status code, text and URL.
    """

    def _as_result(response) -> FetchResult:
        # Both libraries expose status_code / text / url on their responses.
        return FetchResult(
            status_code=response.status_code,
            text=response.text,
            url=str(response.url),
        )

    request_headers = dict(headers) if headers else {}

    if _HAS_CURL_CFFI:
        response = _cf_requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            impersonate=impersonate,
            allow_redirects=follow_redirects,
        )
        return _as_result(response)

    # curl_cffi is unavailable: plain httpx request (no browser impersonation).
    with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as client:
        response = client.get(url, headers=request_headers)
        return _as_result(response)
|
|
|
|
|
|
def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome User-Agent, retrying transient failures.

    Standalone replacement for ``PornAppClient.fetch_tube_html``. Requests go
    through ``browser_get`` (curl_cffi when available) to get past JA3
    fingerprint blocks on Cloudflare-fronted tubes.

    Retry policy: an exception from ``browser_get``, a 5xx status, or a 200
    response whose body is shorter than 500 characters is retried up to
    ``max_retries`` times with exponential backoff (0.5s, 1s, 2s, ...). Some
    tubes (e.g. freshporno) occasionally answer 503/empty; without retries a
    transient hiccup surfaced to the user as "extractor None".

    Args:
        url: Page address to fetch.
        timeout: Per-request timeout forwarded to ``browser_get``.
        max_retries: Number of additional attempts after the first one.
            Negative values are treated as 0, so one request is always made.

    Returns:
        The response body text. A short 200 body is returned as-is once the
        retries are exhausted.

    Raises:
        TubePageError: The final response has a status code >= 400 (4xx is
            never retried).
        Exception: Whatever ``browser_get`` raised on the last attempt.
    """
    import time as _time

    host = urlparse(url).hostname or ""
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        # NOTE(review): "x-site" looks like a carry-over from the
        # PornAppClient code path — confirm the target sites need it.
        "x-site": host,
    }

    # Clamp so a negative max_retries still performs one request instead of
    # skipping the loop and failing without ever touching the network.
    retries = max(max_retries, 0)
    attempts = retries + 1

    for attempt in range(attempts):
        can_retry = attempt < retries
        # Exponential backoff as documented: 0.5s, 1s, 2s, ...
        backoff = 0.5 * 2**attempt

        try:
            resp = browser_get(url, headers=headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            # Broad on purpose: the transport may be curl_cffi or httpx, each
            # with its own exception hierarchy.
            log.info("fetch_tube_html attempt %d/%d for %s: %s", attempt + 1, attempts, url, e)
            if not can_retry:
                raise
            _time.sleep(backoff)
            continue

        body_len = len(resp.text)
        # Retry on 5xx (transient server error) or a near-empty 200 body
        # (CDN cache miss).
        transient = 500 <= resp.status_code < 600 or (resp.status_code == 200 and body_len < 500)
        if transient and can_retry:
            log.info("fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                     url, attempt + 1, attempts, resp.status_code, body_len)
            _time.sleep(backoff)
            continue

        if resp.status_code >= 400:
            raise TubePageError(resp.status_code, url)
        return resp.text

    # Defensive only: the loop runs at least once and its last iteration
    # always returns or raises, so this line is not expected to execute.
    raise TubePageError(0, url)
|
|
|
|
|
|
# Names exported by `from <this module> import *`. `_DEFAULT_UA` is listed
# explicitly, so it is exported despite its leading underscore.
__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]
|