"""Shared KVS engine extractor for `<source>`-tag based players.

KVS (Kernel Video Sharing) is a commercial CMS used by many tube sites. Its
player emits `<source>` tags whose URLs look like
`/get_file/<...>/<id>[_<quality>p].mp4/`. The token embedded in the URL is
IP-bound (signed for the VPS that fetched the embed).

Per-tube differences:
- pornhat: get_file 302 → HLS m3u8 manifest. Type='m3u8'.
- freshporno: get_file 302 → direct mp4 CDN
  (remote_control.php?...&file=...). Type='mp4'.

The generic `_embed_iframe.extract` Stage 0.5 grabs ALL mp4 URLs from the page
HTML — including `_preview*.mp4` files from the sidebar's suggested videos
(different scene IDs with a different token → 404 on fetch), plus duplicated
`<source>` entries (multi-CDN load balancing).

Here we only take `<source>` tags pointing at `/get_file/`, drop `_preview`
URLs and dedupe by basename — so the user does not see an 18-entry quality
modal for a single scene.
"""

from __future__ import annotations

import logging
import re
from urllib.parse import urlparse

from app.extractors._fetch import fetch_tube_html
from app.extractors._models import StreamSource

log = logging.getLogger(__name__)

# Matches `<source ... src="http(s)://.../get_file/....mp4[/]">` and captures
# the URL. Only double-quoted `src` values are recognised.
_SOURCE_RE = re.compile(
    r'<source[^>]+src="(?P<url>https?://[^"]+/get_file/[^"]+\.mp4/?)"',
    re.IGNORECASE,
)

# Captures the quality suffix ("720p", "1080p", ...) that directly precedes
# the `.mp4` extension.
_QUALITY_RE = re.compile(r"_(?P<q>\d{3,4}p)\.mp4(?:/|$|\?)")


def _quality_rank(source: StreamSource) -> int:
    """Return the numeric height for ``source.quality`` ("720p" → 720).

    Sources with a missing or non-numeric quality rank as -1 so that they
    end up last when sorting in descending order.
    """
    if not source.quality:
        return -1
    try:
        return int(source.quality.rstrip("p"))
    except ValueError:
        return -1


def extract_kvs_sources(
    page_url: str,
    *,
    stream_type: str = "mp4",
    timeout: float = 60.0,
    log_tag: str = "kvs",
) -> list[StreamSource] | None:
    """Extract `<source>` URLs from a KVS page, deduped and without previews.

    Args:
        page_url: URL of the KVS video page to fetch.
        stream_type: 'mp4' (freshporno direct mp4) or 'm3u8' (pornhat HLS
            manifest). Stored as-is in the ``type`` field of every result.
        timeout: Timeout forwarded to ``fetch_tube_html``.
        log_tag: Prefix for log lines (tube name).

    Returns:
        Stream sources sorted by descending quality (entries without a
        recognisable quality last, ties in page order), or ``None`` when the
        page has no matching `<source>` tags or all of them are `_preview`
        trailers.
    """
    # NOTE(review): fetch_tube_html's failure contract is not visible from
    # this module; a falsy result is treated as an empty page so we return
    # None instead of crashing inside the regex — confirm against _fetch.
    html = fetch_tube_html(page_url, timeout=timeout) or ""

    urls = [match.group("url") for match in _SOURCE_RE.finditer(html)]
    if not urls:
        log.info("%s: no tags found on %s", log_tag, page_url)
        return None

    seen_keys: set[str] = set()
    result: list[StreamSource] = []
    for url in urls:
        path = urlparse(url).path
        # Preview trailers are skipped (see the module docstring for why).
        if "_preview" in path:
            continue

        # Dedupe on the last non-empty path segment (the file basename) so
        # the same file served from several CDN hosts is listed only once;
        # the first occurrence wins.
        segments = [part for part in path.split("/") if part]
        key = segments[-1] if segments else url
        if key in seen_keys:
            continue
        seen_keys.add(key)

        q_match = _QUALITY_RE.search(url)
        quality = q_match.group("q") if q_match else None
        result.append(StreamSource(link=url, type=stream_type, quality=quality))

    # Tags were found but every one of them was filtered out as a preview.
    if not result:
        log.info("%s: all sources were _preview trailers on %s", log_tag, page_url)
        return None

    result.sort(key=_quality_rank, reverse=True)
    return result