Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
163 lines
6.2 KiB
Python
163 lines
6.2 KiB
Python
"""yt-dlp wrapper — generic stream URL extractor dla mainstream tubes.
|
||
|
||
yt-dlp ships battle-tested extractors for pornhub, xvideos, xnxx, xhamster, redtube,
youporn, porntrex and ~30 others — the full list is in yt_dlp/extractor/_extractors.py.
Here it is used as a single adapter for mainstream tubes that are not worth writing
from scratch (they change their HTML every few months, have anti-bot protection, and
obfuscate their JS players).

yt-dlp output:
- `info["url"]` or `info["formats"]` — the formats list contains all quality variants
- every format has `url`, `format_id`, `height`, `ext`, `protocol`

Format → StreamSource mapping:
- `protocol` contains 'm3u8' or 'hls' (e.g. 'm3u8', 'm3u8_native'), or `ext == 'm3u8'` → type='m3u8'
- `ext == 'mp4'` → type='mp4'
- `ext == 'webm'` → type='webm'
- `ext == 'mpd'` or `protocol` contains 'dash' → type='mpd'
- anything else → type=`ext` (or None when `ext` is empty)
- quality = `f"{height}p"` if height is a positive int, else `format_note` or `format_id`
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from typing import Any
|
||
|
||
from app.extractors._models import StreamSource, TubePageError
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
|
||
def _format_to_source(fmt: dict[str, Any]) -> StreamSource | None:
    """Map a single yt-dlp format dict onto a StreamSource.

    Args:
        fmt: A yt-dlp format dict. The keys read here are ``url``, ``protocol``,
            ``ext``, ``height``, ``format_note``, ``format_id`` and
            ``http_headers``.

    Returns:
        A StreamSource carrying the URL, a quality label, a type hint, the
        original dict (as ``raw``) and the Referer header if one is present;
        ``None`` when the format has no usable ``url``.
    """
    link = fmt.get("url")
    if not link:
        # Nothing playable without a URL.
        return None

    proto = (fmt.get("protocol") or "").lower()
    extension = (fmt.get("ext") or "").lower()

    # HLS is detected first (by protocol or extension), then plain containers,
    # then DASH; anything else falls back to the bare extension.
    kind: str | None
    if extension == "m3u8" or any(tag in proto for tag in ("m3u8", "hls")):
        kind = "m3u8"
    elif extension in ("mp4", "webm"):
        kind = extension
    elif extension == "mpd" or "dash" in proto:
        kind = "mpd"
    else:
        kind = extension or None

    # Prefer "<height>p"; otherwise fall back to yt-dlp's own labels.
    height = fmt.get("height")
    has_height = isinstance(height, int) and height > 0
    quality = f"{height}p" if has_height else (fmt.get("format_note") or fmt.get("format_id"))

    # yt-dlp returns a Referer matching the CDN in `http_headers` — e.g. for
    # 0dayxx → watchporn.to embed iframe → `Referer: https://watchporn.to/embed/143412`.
    # Without it the watchporn.to/get_file/... CDN answers 410 (cookie binding).
    headers = fmt.get("http_headers") or {}
    referer = headers.get("Referer")

    return StreamSource(link=link, quality=quality, type=kind, raw=fmt, referer=referer)
|
||
|
||
|
||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Run yt-dlp in extract-only mode (no download) and map its formats to StreamSource.

    Args:
        page_url: URL of the tube page to resolve.
        timeout: Passed to yt-dlp as ``socket_timeout`` after truncation to int.

    Returns:
        The deduplicated list of sources, or ``None`` when yt-dlp fails, returns
        no info, or yields no format with a URL.

    Raises:
        TubePageError: When yt-dlp's DownloadError text indicates the page is
            gone (HTTP 404/410, "video unavailable", or "removed").
    """
    from yt_dlp import YoutubeDL
    from yt_dlp.networking.impersonate import ImpersonateTarget
    from yt_dlp.utils import DownloadError, ExtractorError

    # Chrome UA + TLS impersonation — without it Cloudflare returns 403 to
    # xhamster (and a few others) for the default `yt-dlp/<version>` UA.
    # `impersonate` requires curl_cffi (a downgrade to 0.14 is required — 0.15
    # breaks yt-dlp's `_AVAILABLE_IMPERSONATE_TARGETS` check). yt-dlp 2026.03.17
    # requires an `ImpersonateTarget` OBJECT, not a string — passing `"chrome"`
    # was accepted by the previous release but now raises AssertionError in
    # `is_supported_target()` (bug report 2026-05-16: youporn/xnxx/xvideos broken).
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }
    options = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": int(timeout),
        "impersonate": ImpersonateTarget("chrome"),
        "http_headers": browser_headers,
    }

    try:
        with YoutubeDL(options) as downloader:
            info = downloader.extract_info(page_url, download=False)
    except DownloadError as exc:
        # yt-dlp wraps HTTP errors in DownloadError — 404/410 are detected
        # from the message text.
        text = str(exc).lower()
        gone_markers = ("http error 404", "http error 410", "video unavailable", "removed")
        if any(marker in text for marker in gone_markers):
            raise TubePageError(404, page_url) from exc
        log.warning("yt-dlp DownloadError on %s: %s", page_url, exc)
        return None
    except ExtractorError as exc:
        log.warning("yt-dlp ExtractorError on %s: %s", page_url, exc)
        return None
    except Exception as exc:
        # Best-effort boundary: any other failure is logged and reported as "no sources".
        log.warning("yt-dlp unexpected error on %s: %s", page_url, exc)
        return None

    if info is None:
        return None

    candidates = [entry for entry in (info.get("formats") or []) if isinstance(entry, dict)]
    sources: list[StreamSource] = [
        mapped for mapped in map(_format_to_source, candidates) if mapped is not None
    ]

    # Some tubes return single-format info without a "formats" list.
    if not sources:
        fallback = _format_to_source(info)
        if fallback is not None:
            sources = [fallback]

    return _dedupe_formats(sources) or None
|
||
|
||
|
||
def _dedupe_formats(sources: list[StreamSource]) -> list[StreamSource]:
    """Keep one source per (quality, type) and order them best quality first.

    Per the original author's notes: some tubes (xhamster) return 24+ formats —
    every quality × {mp4, hls} × several CDN mirrors — and most mirrors are
    IP-bound or geo-restricted and answer 502/404. yt-dlp orders formats
    worst→best, so the LAST entry for a given (quality, type) is the one with
    the highest bitrate/preference; that entry is kept.

    Args:
        sources: Sources in yt-dlp order. Only ``.quality`` and ``.type`` are read.

    Returns:
        One source per distinct (quality, type), sorted by numeric quality
        descending (1080p → 144p), with HLS before MP4 before WebM before DASH
        within the same quality. The sort is descending because, per the
        original note, the player picks the first entry by default. An empty
        input is returned unchanged.
    """
    if not sources:
        return sources

    # Group by (quality, type); later entries overwrite earlier ones so the
    # last (preferred) variant wins.
    by_key: dict[tuple[str | None, str | None], StreamSource] = {}
    for s in sources:
        by_key[(s.quality, s.type)] = s

    def _quality_int(q: str | None) -> int:
        """Return the numeric height encoded in a quality label, or 0 if there is none."""
        if not q:
            return 0
        # Parse the text before the first "p" so that suffixed labels such as
        # "1080p60", "720p HD" or "240P " still rank by their height instead of
        # collapsing to 0 and sorting below every real resolution.
        head = q.strip().lower().partition("p")[0]
        try:
            return int(head)
        except ValueError:
            return 0

    def _type_rank(t: str | None) -> int:
        """Rank stream types: HLS first (adaptive segments → better fallback when a CDN is flaky)."""
        return {"m3u8": 0, "mp4": 1, "webm": 2, "mpd": 3}.get(t or "", 9)

    # sorted() is stable, so entries with equal keys keep first-seen order.
    return sorted(
        by_key.values(),
        key=lambda s: (-_quality_int(s.quality), _type_rank(s.type)),
    )
|