"""yt-dlp wrapper — generic stream URL extractor dla mainstream tubes. yt-dlp ma battle-tested extractory dla pornhub, xvideos, xnxx, xhamster, redtube, youporn, porntrex i ~30 innych — pełna lista w yt_dlp/extractor/_extractors.py. Tu używamy go jako jeden adapter dla mainstream tubes których nie ma sensu pisać od zera (zmieniają HTML co kilka miesięcy, mają anti-bot, obfuscation w JS playerach). Output yt-dlp: - `info["url"]` lub `info["formats"]` — formats lista zawiera wszystkie quality variants - każdy format ma `url`, `format_id`, `height`, `ext`, `protocol` Mapowanie format → StreamSource: - `protocol == 'm3u8' / 'm3u8_native' / 'hls'` → type='m3u8' - `ext == 'mp4'` → type='mp4' - `ext == 'webm'` → type='webm' - quality = `f"{height}p"` jeśli height present, else `format_id` """ from __future__ import annotations import logging from typing import Any from app.extractors._models import StreamSource, TubePageError log = logging.getLogger(__name__) def _format_to_source(fmt: dict[str, Any]) -> StreamSource | None: url = fmt.get("url") if not url: return None protocol = (fmt.get("protocol") or "").lower() ext = (fmt.get("ext") or "").lower() if "m3u8" in protocol or "hls" in protocol or ext == "m3u8": type_hint: str | None = "m3u8" elif ext == "mp4": type_hint = "mp4" elif ext == "webm": type_hint = "webm" elif ext == "mpd" or "dash" in protocol: type_hint = "mpd" else: type_hint = ext or None height = fmt.get("height") if isinstance(height, int) and height > 0: quality = f"{height}p" else: quality = fmt.get("format_note") or fmt.get("format_id") # yt-dlp w `http_headers` zwraca Referer pasujący do CDN — np. dla 0dayxx → # watchporn.to embed iframe → `Referer: https://watchporn.to/embed/143412`. # Bez tego CDN watchporn.to/get_file/... zwraca 410 (cookie binding). referer = (fmt.get("http_headers") or {}).get("Referer") return StreamSource(link=url, quality=quality, type=type_hint, raw=fmt, referer=referer) def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None: """Wywołuje yt-dlp w extract-only mode (bez pobierania) i mapuje formats na StreamSource. Raises TubePageError gdy yt-dlp dostał 404/410 dla tube page. """ from yt_dlp import YoutubeDL from yt_dlp.networking.impersonate import ImpersonateTarget from yt_dlp.utils import DownloadError, ExtractorError # Chrome UA + TLS impersonation — bez tego xhamster (i kilka innych) Cloudflare # zwraca 403 dla default `yt-dlp/` UA. `impersonate` wymaga curl_cffi # (downgrade do 0.14 wymagany — 0.15 łamie yt-dlp's `_AVAILABLE_IMPERSONATE_TARGETS` # check). yt-dlp 2026.03.17 wymaga `ImpersonateTarget` OBJECT, nie string — wczesnie # przekazywałem `"chrome"` co poprzedni release przyjmował, teraz AssertionError # w `is_supported_target()` (bug-report 2026-05-16: youporn/xnxx/xvideos broken). ydl_opts = { "quiet": True, "no_warnings": True, "skip_download": True, "noplaylist": True, "socket_timeout": int(timeout), "impersonate": ImpersonateTarget("chrome"), "http_headers": { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9", }, } try: with YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(page_url, download=False) except DownloadError as e: msg = str(e).lower() # yt-dlp opakowuje HTTP errors w DownloadError — wykrywamy 404/410 po treści. if "http error 404" in msg or "http error 410" in msg or "video unavailable" in msg or "removed" in msg: raise TubePageError(404, page_url) from e log.warning("yt-dlp DownloadError on %s: %s", page_url, e) return None except ExtractorError as e: log.warning("yt-dlp ExtractorError on %s: %s", page_url, e) return None except Exception as e: log.warning("yt-dlp unexpected error on %s: %s", page_url, e) return None if info is None: return None formats = info.get("formats") or [] sources: list[StreamSource] = [] for fmt in formats: if not isinstance(fmt, dict): continue s = _format_to_source(fmt) if s is not None: sources.append(s) # Niektóre tubes zwracają single-format info bez "formats" listy. if not sources: single = _format_to_source(info) if single is not None: sources.append(single) return _dedupe_formats(sources) or None def _dedupe_formats(sources: list[StreamSource]) -> list[StreamSource]: """Dedupe yt-dlp formats per (quality, type) — niektóre tubes (xhamster) zwracają 24+ formatów: każda jakość × {mp4, hls} × kilka CDN mirrors. Większość mirror'ów jest IP-bound albo geo-restricted i daje 502/404. yt-dlp ordering: worst→best, czyli OSTATNI wpis dla danej (quality, type) jest najwyższego bitrate'a/preferencji. Bierzemy go. Output: dla każdej jakości jeden HLS + jeden MP4 (jeśli istnieje), HLS preferred. Sortujemy descending po quality (1080p → 144p) bo gracz domyślnie bierze pierwszy. """ if not sources: return sources # Grupowanie: (quality, type) → ostatni StreamSource by_key: dict[tuple[str | None, str | None], StreamSource] = {} for s in sources: key = (s.quality, s.type) by_key[key] = s # Ranking: HLS przed MP4 (HLS ma adaptive segments → lepszy fallback gdy CDN flaky). # Quality numeric sort descending — "1080p" → 1080, "720p" → 720, "240p" → 240. def _quality_int(q: str | None) -> int: if not q: return 0 try: return int(q.rstrip("pP").rstrip()) except ValueError: return 0 def _type_rank(t: str | None) -> int: return {"m3u8": 0, "mp4": 1, "webm": 2, "mpd": 3}.get(t or "", 9) deduped = list(by_key.values()) deduped.sort(key=lambda s: (-_quality_int(s.quality), _type_rank(s.type))) return deduped