goon/app/extractors/tubes/_kvs_source.py

"""Shared KVS engine extractor: `<source src="...get_file/.../<scene_id>_<quality>.mp4/">`.

KVS (Kernel Video Sharing) is a commercial CMS used by many tube sites. Its player
emits `<source>` tags whose URL has the form
`<host>/get_file/<bucket_id>/<token>/<X>/<scene_id>/<scene_id>[_<quality>p].mp4/`.
The token is IP-bound (signed for the VPS that fetched the embed).

Per-tube differences:
  - pornhat: get_file 302 → HLS m3u8 manifest. Type='m3u8'.
  - freshporno: get_file 302 → direct mp4 CDN (remote_control.php?...&file=...). Type='mp4'.

The generic `_embed_iframe.extract` Stage 0.5 catches ALL mp4 URLs in the page HTML —
including `_preview*.mp4` from the sidebar's suggested videos (different scene IDs with
a different token → 404 when fetched), plus duplicate `<source>` tags (multi-CDN load
balancing).

Here we extract only the `<source>` tags from the player area, filter out `_preview`
URLs, and dedupe by basename — so the user does not see 18 entries in the quality
modal for a single scene.
"""
from __future__ import annotations

import logging
import re
from urllib.parse import urlparse

from app.extractors._fetch import fetch_tube_html
from app.extractors._models import StreamSource

log = logging.getLogger(__name__)

# Matches a double-quoted `src` attribute inside a `<source ...>` tag whose URL
# is http(s), contains `/get_file/`, and ends in `.mp4` with an optional trailing
# slash. Matching is case-insensitive. `url` is the pattern's only group, so
# `findall` yields plain URL strings rather than tuples.
_SOURCE_RE = re.compile(
    r'<source[^>]+src="(?P<url>https?://[^"]+/get_file/[^"]+\.mp4/?)"',
    re.IGNORECASE,
)
# Captures a quality label (3-4 digits followed by `p`, e.g. `720p`) from a
# `_<label>.mp4` suffix that is followed by `/`, `?`, or the end of the string.
# Unlike _SOURCE_RE this pattern is case-sensitive.
_QUALITY_RE = re.compile(r"_(?P<q>\d{3,4}p)\.mp4(?:/|$|\?)")


def extract_kvs_sources(
    page_url: str,
    *,
    stream_type: str = "mp4",
    timeout: float = 60.0,
    log_tag: str = "kvs",
) -> list[StreamSource] | None:
    """Collect the `<source>` stream URLs of a KVS page, best quality first.

    The page HTML is fetched, every URL matched by ``_SOURCE_RE`` is gathered,
    URLs whose path contains ``_preview`` are discarded, and the remainder is
    deduplicated by the last non-empty path segment (the first occurrence
    wins).

    Args:
        page_url: Address of the page to fetch and scan.
        stream_type: Value stored as ``type`` on every returned source:
            'mp4' (freshporno, direct mp4) or 'm3u8' (pornhat, HLS manifest).
        timeout: Forwarded to ``fetch_tube_html``.
        log_tag: Prefix used in log lines (the tube name).

    Returns:
        Sources ordered by descending numeric quality; entries without a
        parsable quality come last. ``None`` when the page has no matching
        `<source>` tags or when all of them were preview trailers.
    """
    page = fetch_tube_html(page_url, timeout=timeout)

    candidates = _SOURCE_RE.findall(page)
    if not candidates:
        log.info("%s: no <source> tags found on %s", log_tag, page_url)
        return None

    # Preview trailers are recognised by `_preview` anywhere in the URL path.
    playable = [link for link in candidates if "_preview" not in urlparse(link).path]
    if not playable:
        log.info("%s: all sources were _preview trailers on %s", log_tag, page_url)
        return None

    # Keyed by basename; dict insertion order keeps the page order of the
    # first occurrence of each file.
    by_basename: dict[str, StreamSource] = {}
    for link in playable:
        segments = list(filter(None, urlparse(link).path.split("/")))
        basename = segments[-1] if segments else link
        if basename in by_basename:
            continue
        found = _QUALITY_RE.search(link)
        by_basename[basename] = StreamSource(
            link=link,
            type=stream_type,
            quality=found.group("q") if found else None,
        )

    def _height(source: StreamSource) -> int:
        # Numeric part of a label such as "720p"; -1 ranks unknown quality last.
        label = source.quality
        if label:
            try:
                return int(label.rstrip("p"))
            except ValueError:
                pass
        return -1

    # sorted() is stable, so sources of equal quality keep their page order.
    return sorted(by_basename.values(), key=_height, reverse=True)