goon/app/extractors/__init__.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

157 lines
7 KiB
Python

"""Per-tube stream URL extractors.

Public API:
- `try_extract(sitetag, page_url) -> list[StreamSource] | None`
- `supported_sitetags() -> tuple[str, ...]`
- `StreamSource` (dataclass)
- `HosterDead` (exception)
- `TubePageError` (exception)
- `extract_stream_from_hoster(iframe_url, *, referer)` — generic packer-based hoster extract
- `unpack_packer` — packer unpacking helper, re-exported from `app.extractors.hoster`
- `fetch_tube_html(url)` — Chrome TLS fingerprint fetch (curl_cffi)
- `browser_get(url)` — low-level

Architecture: every tube has its own module `app.extractors.tubes.<tube>` which exports
`extract(page_url) -> list[StreamSource] | None`. The registry below maps sitetag →
module-level extractor. `try_extract()` is a thin wrapper that adds exception handling.

Since the porn-app dependency was removed, this module is the only mechanism for resolving
streams — playback.py no longer falls back to the porn-app /stream API.
"""
from __future__ import annotations
import logging
from collections.abc import Callable
from app.extractors._fetch import browser_get, fetch_tube_html
from app.extractors._models import HosterDead, StreamSource, TubePageError
from app.extractors.hoster import extract_stream_from_hoster, unpack_packer
from app.extractors.tubes import (
_embed_iframe,
_vps_blocked_fallback,
_ytdlp,
eporner,
freshporno,
hqporner,
latestpornvideo,
paradisehill,
porn00,
pornhat,
pornxp,
sxyprn,
)
# Module-level logger; try_extract() uses it to report extractor failures.
log = logging.getLogger(__name__)
# Sitetag → extractor function. The sitetag matches the format used in origin:
# `pornapp:<sitetag>` (or, after the Phase 2 migration: `tube:<sitetag>`).
#
# Mainstream tubes (pornhub/xvideos/xnxx/redtube/youporn) use yt-dlp as the
# extractor — battle-tested and updated by upstream whenever the HTML changes.
# xhamster and porntrex are mainstream as well, but are currently routed through
# the WebView fallback (see their entries below). Aggregator tubes
# (xmoviesforyou/watchporn/siska/...) use the generic embed-iframe extractor
# (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom code exists only where a
# tube has a non-standard scheme (eporner XHR, sxyprn URL transform).
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# Custom (optimized / non-standard player)
# hqporner — the CDN URL (bigcdn.cc, video.flyflv.com with the `ip=` parameter) is
# IP-bound to the requester. Resolving on the VPS returns 200 but mobile direct
# = 404/403. Switched to the WebView fallback: mobile fetches the embed iframe
# (mydaddy.cc/hqwo.cc) from the phone IP, FluidPlayer JS decodes the mp4 URL in the
# mobile session. In addition INJECTED_JS scans `<source>.src`.
# ~32k scenes (second-largest single saving after porntrex). Verified 2026-05-18.
"hqpornercom": _vps_blocked_fallback.extract,
"epornercom": eporner.extract,
"sxyprncom": sxyprn.extract,
# Mainstream tubes — yt-dlp
# NB: the 2026-05-18 cross-IP test confirmed that xvideos/xnxx/pornhub/youporn/redtube
# CDN URLs are **time-bound** (not IP-bound) — the mobile_direct_ok auto-detect in
# playback.py yields a mobile direct fetch, zero VPS bandwidth.
"pornhubcom": _ytdlp.extract,
"redtubecom": _ytdlp.extract,
"xvideoscom": _ytdlp.extract,
"xnxxcom": _ytdlp.extract,
"youporncom": _ytdlp.extract,
# porntrex KVS get_file — `kt_ips=<vps_ip>` cookie + single-use token (410 on reuse).
# The CDN is IP-bound to the VPS, mobile direct = 403. Switched to _vps_blocked_fallback:
# mobile WebView from the phone IP → KVS player JS decodes video.src → INJECTED_JS scrape.
# 137k scenes saved from VPS bandwidth (largest single saving).
"porntrexcom": _vps_blocked_fallback.extract,
# VPS-blocked tubes — KVS / Cloudflare blocks the Hetzner IP, but they work from a
# residential IP (confirmed with Chrome DevTools MCP 2026-05-15). Mobile WebView +
# INJECTED_JS (PlayerScreen.tsx:805) scans <video>.src + XHR — it catches the URL after
# the player JS has decoded it.
"xhamstercom": _vps_blocked_fallback.extract,
"porndittcom": _vps_blocked_fallback.extract,
"fpoxxx": _vps_blocked_fallback.extract,
"sxylandcom": _vps_blocked_fallback.extract,
# Aggregator tubes — generic embed-iframe → hoster unpacker
"latestpornvideocom": latestpornvideo.extract,
"xmoviesforyoucom": _embed_iframe.extract,
"watchporn": _embed_iframe.extract,
"siskavideo": _embed_iframe.extract,
"porn4dayspw": _embed_iframe.extract,
"porndishcom": _embed_iframe.extract,
# xxxfreewatch — DELISTED 2026-05-18. 790 solo-orphan scenes, 0% match, CF-walled from the VPS.
"latestleaksco": _embed_iframe.extract,
"mypornerleakcom": _embed_iframe.extract,
# PornHat — dedicated extractor: only `<source>` from the player area (skips sidebar
# trailer URLs `_preview*.mp4`), dedupes by filename. Get_file 302 → CDN, the proxy
# requires follow_redirects=True (fix in stream_proxy.py).
"pornhatcom": pornhat.extract,
# Freshporno KVS — the `cv=` HMAC-signed token is IP-bound. Server-side resolve returned
# 200 from the VPS, but a laptop got 302+SSL error → the token validates the requester IP.
# Switched to the WebView fallback: mobile fetches the embed page, the KVS player decodes
# video_url in-page, ExoPlayer gets the URL from the phone session. ~15k scenes.
"freshpornoorg": _vps_blocked_fallback.extract,
# porn00 / pornxp — force_proxy=True outright (IP-bound CDN). Switched to the WebView
# fallback. Low volume (84 scenes): trivial saving, but it keeps the flow consistent.
"porn00org": _vps_blocked_fallback.extract,
"pornxpph": _vps_blocked_fallback.extract,
# Direct-scraping tubes (they also have a search scraper in connectors/direct_scrapers/)
# — they use the identical embed-iframe pattern for streaming.
# hdporn92com — DELISTED 2026-05-18. Scene pages are an SEO shell without a player iframe,
# JS hijacks clicks into a popunder. All playback_sources mass-marked dead.
# 0dayxx wraps the watchporn.to embed. The watchporn.to/get_file/ token is IP-bound
# (302→410 cross-IP). Switched to the WebView fallback. ~5k scenes.
"0dayxxcom": _vps_blocked_fallback.extract,
# CF-protected tube — curl_cffi in fetch_tube_html bypasses JA3, embed-iframe pattern.
"perverzijacom": _embed_iframe.extract,
# Special: WebView-only (Yii2 session-bound player).
"paradisehillcc": paradisehill.extract,
}
def try_extract(sitetag: str, page_url: str) -> list[StreamSource] | None:
    """Resolve stream URLs for ``page_url`` with the extractor registered for ``sitetag``.

    Returns:
        Whatever the registered extractor returns — a list of ``StreamSource``
        (different qualities/containers) or ``None`` when it found no URL.
        ``None`` is also returned when no extractor is registered for
        ``sitetag``, or when the extractor raised an unexpected exception
        (which is logged as a warning).

    Raises:
        HosterDead: re-raised unchanged; signals that the embed page explicitly
            reports the video as deleted / not found, so the caller
            (playback.py) can mark ``playback_source.dead_at``.
        TubePageError: re-raised unchanged for the caller to handle.
    """
    handler = _REGISTRY.get(sitetag)
    if handler is None:
        # Unknown sitetag: nothing to try.
        return None

    try:
        sources = handler(page_url)
    except (HosterDead, TubePageError):
        # Domain-level signals must reach the caller untouched.
        raise
    except Exception as exc:
        # Best-effort boundary: a broken extractor must not break playback.
        log.warning("extractor for %s failed on %s: %s", sitetag, page_url, exc)
        sources = None
    return sources
def supported_sitetags() -> tuple[str, ...]:
    """Return a tuple of every sitetag that has a registered extractor."""
    # Iterating a dict yields its keys, so this is the registry's key set as a tuple.
    return tuple(_REGISTRY)
# Public names of this package: the two registry helpers defined in this module
# plus the models, hoster helpers and fetch helpers re-exported from the
# submodules imported at the top of the file.
__all__ = [
"try_extract",
"supported_sitetags",
"StreamSource",
"HosterDead",
"TubePageError",
"extract_stream_from_hoster",
"unpack_packer",
"fetch_tube_html",
"browser_get",
]