# goon/app/extractors/tubes/_ytdlp.py

"""yt-dlp wrapper — generic stream URL extractor dla mainstream tubes.

yt-dlp ma battle-tested extractory dla pornhub, xvideos, xnxx, xhamster, redtube,
youporn, porntrex i ~30 innych — pełna lista w yt_dlp/extractor/_extractors.py.
Tu używamy go jako jeden adapter dla mainstream tubes których nie ma sensu pisać
od zera (zmieniają HTML co kilka miesięcy, mają anti-bot, obfuscation w JS playerach).

Output yt-dlp:
  - `info["url"]` lub `info["formats"]` — formats lista zawiera wszystkie quality variants
  - każdy format ma `url`, `format_id`, `height`, `ext`, `protocol`

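Format → StreamSource mapping (checked in this order):
  - `protocol` contains 'm3u8' or 'hls' (e.g. 'm3u8_native'), or `ext == 'm3u8'` → type='m3u8'
  - `ext == 'mp4'` → type='mp4'
  - `ext == 'webm'` → type='webm'
  - `ext == 'mpd'` or `protocol` contains 'dash' → type='mpd'
  - otherwise → type=`ext` (None when `ext` is empty)
  - quality = `f"{height}p"` if height is a positive int, else `format_note`, else `format_id`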
"""
from __future__ import annotations

import logging
from typing import Any

from app.extractors._models import StreamSource, TubePageError

log = logging.getLogger(__name__)


def _format_to_source(fmt: dict[str, Any]) -> StreamSource | None:
    """Convert a single yt-dlp format dict into a ``StreamSource``.

    Returns ``None`` when the format carries no usable ``url``.

    Stream type, in priority order:
      - protocol mentions ``m3u8``/``hls`` or ``ext`` is ``m3u8`` → ``"m3u8"``
      - ``ext`` is ``mp4`` or ``webm`` → that extension
      - ``ext`` is ``mpd`` or protocol mentions ``dash`` → ``"mpd"``
      - anything else → the lowercased ``ext``, or ``None`` when it is empty

    Quality is ``"<height>p"`` for a positive integer ``height``; otherwise
    ``format_note``, falling back to ``format_id``.
    """
    link = fmt.get("url")
    if not link:
        return None

    proto = (fmt.get("protocol") or "").lower()
    extension = (fmt.get("ext") or "").lower()

    # HLS is tested before the plain extension checks, so an HLS-protocol
    # format whose ext is "mp4" is still reported as "m3u8".
    looks_like_hls = extension == "m3u8" or any(tag in proto for tag in ("m3u8", "hls"))
    if looks_like_hls:
        kind: str | None = "m3u8"
    elif extension in ("mp4", "webm"):
        kind = extension
    elif extension == "mpd" or "dash" in proto:
        kind = "mpd"
    else:
        kind = extension or None

    pixels = fmt.get("height")
    has_height = isinstance(pixels, int) and pixels > 0
    label = f"{pixels}p" if has_height else (fmt.get("format_note") or fmt.get("format_id"))

    # yt-dlp puts a Referer matching the CDN into `http_headers` — e.g. for
    # 0dayxx → watchporn.to embed iframe → `Referer: https://watchporn.to/embed/143412`.
    # Without it the watchporn.to/get_file/... CDN answers 410 (cookie binding).
    headers = fmt.get("http_headers") or {}

    return StreamSource(
        link=link,
        quality=label,
        type=kind,
        raw=fmt,
        referer=headers.get("Referer"),
    )


def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Run yt-dlp in extract-only mode (no download) and map its formats to StreamSource.

    Args:
        page_url: URL of the tube page handed to yt-dlp.
        timeout: Passed to yt-dlp as ``socket_timeout`` after truncation to int.

    Returns:
        De-duplicated sources as produced by ``_dedupe_formats``, or ``None``
        when yt-dlp failed, returned no info, or no format had a usable URL.

    Raises:
        TubePageError: (always with status 404) when the yt-dlp DownloadError
            message mentions HTTP 404/410, "video unavailable" or "removed".
    """
    from yt_dlp import YoutubeDL
    from yt_dlp.networking.impersonate import ImpersonateTarget
    from yt_dlp.utils import DownloadError, ExtractorError

    # Chrome UA + TLS impersonation — without it Cloudflare answers 403 to xhamster
    # (and a few others) for the default `yt-dlp/<version>` UA. `impersonate` needs
    # curl_cffi (a downgrade to 0.14 is required — 0.15 breaks yt-dlp's
    # `_AVAILABLE_IMPERSONATE_TARGETS` check). yt-dlp 2026.03.17 requires an
    # `ImpersonateTarget` OBJECT, not a string — the plain `"chrome"` string that the
    # previous release accepted now triggers an AssertionError in
    # `is_supported_target()` (bug-report 2026-05-16: youporn/xnxx/xvideos broken).
    socket_timeout = int(timeout)
    chrome_target = ImpersonateTarget("chrome")
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }
    options = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": socket_timeout,
        "impersonate": chrome_target,
        "http_headers": request_headers,
    }

    # yt-dlp wraps HTTP errors in DownloadError, so a missing page can only be
    # recognised from the message text.
    gone_markers = ("http error 404", "http error 410", "video unavailable", "removed")

    try:
        with YoutubeDL(options) as ydl:
            info = ydl.extract_info(page_url, download=False)
    except DownloadError as err:
        lowered = str(err).lower()
        if any(marker in lowered for marker in gone_markers):
            raise TubePageError(404, page_url) from err
        log.warning("yt-dlp DownloadError on %s: %s", page_url, err)
        return None
    except ExtractorError as err:
        log.warning("yt-dlp ExtractorError on %s: %s", page_url, err)
        return None
    except Exception as err:
        # Best-effort boundary: any other failure is logged and reported as "no sources".
        log.warning("yt-dlp unexpected error on %s: %s", page_url, err)
        return None

    if info is None:
        return None

    converted = (
        _format_to_source(entry)
        for entry in (info.get("formats") or [])
        if isinstance(entry, dict)
    )
    sources: list[StreamSource] = [src for src in converted if src is not None]

    # Some tubes return a single-format info dict without a "formats" list.
    if not sources:
        fallback = _format_to_source(info)
        if fallback is not None:
            sources = [fallback]

    return _dedupe_formats(sources) or None


def _dedupe_formats(sources: list[StreamSource]) -> list[StreamSource]:
    """Collapse yt-dlp formats to one entry per (quality, type) and sort best-first.

    Some tubes (xhamster) return 24+ formats: every quality × {mp4, hls} × several
    CDN mirrors, and most mirrors are IP-bound or geo-restricted (502/404). This
    relies on yt-dlp listing formats worst→best, so the LAST entry seen for a
    given (quality, type) is the preferred one and is the one kept.

    The result is sorted by numeric quality descending (1080p → 144p), because the
    player picks the first entry by default; within one quality HLS comes before
    MP4, then WebM, then DASH, then anything else.

    Quality labels are ranked by their resolution number. Besides the plain
    ``"720p"`` form, labels that merely contain a ``<digits>p`` token
    (``"1080p-0"``, ``"1440p60"``, ``"mp4-720p"``) are ranked by that number
    instead of falling to the bottom of the list. Labels without such a token
    rank as 0.

    Args:
        sources: Sources in yt-dlp order; each needs ``quality`` and ``type``.

    Returns:
        A new sorted list, or ``sources`` itself when it is empty.
    """
    import re

    if not sources:
        return sources

    # Later entries overwrite earlier ones under the same key → last one wins.
    latest_by_key: dict[tuple[str | None, str | None], StreamSource] = {
        (src.quality, src.type): src for src in sources
    }

    # HLS first: adaptive segments make it a better fallback when the CDN is flaky.
    type_order = {"m3u8": 0, "mp4": 1, "webm": 2, "mpd": 3}
    # A digit run directly followed by "p" that does not continue into a word.
    resolution_token = re.compile(r"([0-9]+)[pP](?![A-Za-z])")

    def _quality_int(q: str | None) -> int:
        if not q:
            return 0
        try:
            # Canonical labels ("1080p", "720") keep their original parse.
            return int(q.rstrip("pP").rstrip())
        except ValueError:
            pass
        found = resolution_token.search(q)
        return int(found.group(1)) if found else 0

    def _type_rank(t: str | None) -> int:
        return type_order.get(t or "", 9)

    # sorted() is stable, so ties keep first-seen order.
    return sorted(
        latest_by_key.values(),
        key=lambda src: (-_quality_int(src.quality), _type_rank(src.type)),
    )