goon/app/extractors/tubes/_ytdlp.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

163 lines
6.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""yt-dlp wrapper — generic stream URL extractor dla mainstream tubes.
yt-dlp ships battle-tested extractors for pornhub, xvideos, xnxx, xhamster, redtube,
youporn, porntrex and ~30 others — the full list lives in yt_dlp/extractor/_extractors.py.
Here it serves as a single adapter for mainstream tubes that are not worth writing
from scratch (they change their HTML every few months and use anti-bot measures and
obfuscated JS players).
yt-dlp output:
- `info["url"]` or `info["formats"]` — the formats list contains all quality variants
- each format has `url`, `format_id`, `height`, `ext`, `protocol`
Format → StreamSource mapping:
- `protocol` containing 'm3u8' / 'hls' (e.g. 'm3u8', 'm3u8_native'), or `ext == 'm3u8'` → type='m3u8'
- `ext == 'mp4'` → type='mp4'
- `ext == 'webm'` → type='webm'
- `ext == 'mpd'` or `protocol` containing 'dash' → type='mpd'
- quality = `f"{height}p"` if a positive integer height is present, else `format_note`,
  falling back to `format_id`
"""
from __future__ import annotations
import logging
from typing import Any
from app.extractors._models import StreamSource, TubePageError
log = logging.getLogger(__name__)
def _format_to_source(fmt: dict[str, Any]) -> StreamSource | None:
    """Map a single yt-dlp format dict onto a ``StreamSource``.

    Args:
        fmt: A yt-dlp format dict (or a single-format info dict). The keys read
            here are ``url``, ``protocol``, ``ext``, ``height``, ``format_note``,
            ``format_id`` and ``http_headers``.

    Returns:
        A ``StreamSource`` for the format, or ``None`` when the dict has no ``url``.
    """
    url = fmt.get("url")
    if not url:
        return None

    protocol = (fmt.get("protocol") or "").lower()
    ext = (fmt.get("ext") or "").lower()

    # The delivery protocol wins over the container extension. yt-dlp reports
    # segmented formats (HLS, DASH) with the *segment* container as `ext`
    # (typically mp4/webm), so checking `ext` first would label a DASH manifest
    # as a progressive mp4/webm file. HLS was already handled protocol-first;
    # DASH now follows the same rule.
    type_hint: str | None
    if "m3u8" in protocol or "hls" in protocol or ext == "m3u8":
        type_hint = "m3u8"
    elif "dash" in protocol or ext == "mpd":
        type_hint = "mpd"
    elif ext in ("mp4", "webm"):
        type_hint = ext
    else:
        # Unknown container: pass the extension through, or None when absent.
        type_hint = ext or None

    height = fmt.get("height")
    if isinstance(height, int) and height > 0:
        quality = f"{height}p"
    else:
        # No usable height: fall back to yt-dlp's human-readable note, then the id.
        quality = fmt.get("format_note") or fmt.get("format_id")

    # yt-dlp's `http_headers` carries the Referer that matches the CDN — e.g. for
    # 0dayxx → watchporn.to embed iframe → `Referer: https://watchporn.to/embed/143412`.
    # Without it the watchporn.to/get_file/... CDN answers 410 (cookie binding).
    referer = (fmt.get("http_headers") or {}).get("Referer")

    return StreamSource(link=url, quality=quality, type=type_hint, raw=fmt, referer=referer)
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Run yt-dlp in extract-only mode (no download) and map its formats to StreamSource.

    Args:
        page_url: URL of the tube page handed to yt-dlp.
        timeout: Socket timeout in seconds; converted with ``int()`` before being
            passed to yt-dlp as ``socket_timeout``.

    Returns:
        The sources returned by ``_dedupe_formats``, or ``None`` when yt-dlp
        failed, produced no info, or no format yielded a usable source.

    Raises:
        TubePageError: When the yt-dlp ``DownloadError`` message indicates the
            page is gone (HTTP 404/410, "video unavailable", "removed").
    """
    # yt-dlp is imported at call time, not at module import time.
    from yt_dlp import YoutubeDL
    from yt_dlp.networking.impersonate import ImpersonateTarget
    from yt_dlp.utils import DownloadError, ExtractorError

    # Chrome UA + TLS impersonation — without it Cloudflare answers 403 to xhamster
    # (and a few others) for the default `yt-dlp/<version>` UA. `impersonate` needs
    # curl_cffi (a downgrade to 0.14 is required — 0.15 breaks yt-dlp's
    # `_AVAILABLE_IMPERSONATE_TARGETS` check). yt-dlp 2026.03.17 requires an
    # `ImpersonateTarget` OBJECT, not a string — the plain `"chrome"` string was
    # accepted by the previous release but now triggers an AssertionError in
    # `is_supported_target()` (bug report 2026-05-16: youporn/xnxx/xvideos broken).
    chrome_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
    )
    options = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": int(timeout),
        "impersonate": ImpersonateTarget("chrome"),
        "http_headers": {
            "User-Agent": chrome_user_agent,
            "Accept-Language": "en-US,en;q=0.9",
        },
    }

    # yt-dlp wraps HTTP errors in DownloadError — a dead page is detected by
    # looking for these fragments in the lower-cased message.
    gone_markers = ("http error 404", "http error 410", "video unavailable", "removed")

    try:
        with YoutubeDL(options) as ydl:
            info = ydl.extract_info(page_url, download=False)
    except DownloadError as err:
        lowered = str(err).lower()
        if any(marker in lowered for marker in gone_markers):
            raise TubePageError(404, page_url) from err
        log.warning("yt-dlp DownloadError on %s: %s", page_url, err)
        return None
    except ExtractorError as err:
        log.warning("yt-dlp ExtractorError on %s: %s", page_url, err)
        return None
    except Exception as err:
        # Best-effort boundary: any other failure is logged and reported as "no sources".
        log.warning("yt-dlp unexpected error on %s: %s", page_url, err)
        return None

    if info is None:
        return None

    raw_formats = info.get("formats") or []
    mapped = (_format_to_source(entry) for entry in raw_formats if isinstance(entry, dict))
    sources: list[StreamSource] = [src for src in mapped if src is not None]

    # Some tubes return a single-format info dict without a "formats" list.
    if not sources:
        fallback = _format_to_source(info)
        if fallback is not None:
            sources.append(fallback)

    deduped = _dedupe_formats(sources)
    return deduped if deduped else None
def _dedupe_formats(sources: list[StreamSource]) -> list[StreamSource]:
    """Collapse yt-dlp formats to one entry per (quality, type) and sort them.

    Some tubes (xhamster) return 24+ formats: every quality × {mp4, hls} × several
    CDN mirrors. Most mirrors are IP-bound or geo-restricted and answer 502/404.
    yt-dlp orders formats worst→best, so the LAST entry for a given
    (quality, type) is the one with the highest bitrate/preference — that one
    is kept.

    The result is sorted by quality descending (1080p → 144p), because the player
    picks the first entry by default; within one quality HLS comes before MP4,
    then WebM, then MPD, then anything else.

    Args:
        sources: Sources in yt-dlp order; only ``quality`` and ``type`` are read.

    Returns:
        The deduplicated, sorted list. An empty input is returned unchanged.
    """
    if not sources:
        return sources

    # HLS first: adaptive segments make it the better fallback when a CDN is flaky.
    type_order = {"m3u8": 0, "mp4": 1, "webm": 2, "mpd": 3}

    def _numeric_quality(label: str | None) -> int:
        # "1080p" → 1080, "720p" → 720; non-numeric or missing labels rank as 0.
        if not label:
            return 0
        digits = label.rstrip("pP").rstrip()
        try:
            return int(digits)
        except ValueError:
            return 0

    # Later duplicates overwrite the value while the key keeps its first-seen
    # position, i.e. the last source per (quality, type) wins.
    latest_per_key = {(src.quality, src.type): src for src in sources}

    return sorted(
        latest_per_key.values(),
        key=lambda src: (-_numeric_quality(src.quality), type_order.get(src.type or "", 9)),
    )