goon/app/extractors/tubes/_kvs_source.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

83 lines
2.8 KiB
Python

"""Shared KVS engine extractor: `<source src="...get_file/.../<scene_id>_<quality>.mp4/">`.

KVS (Kernel Video Sharing) is a commercial CMS used by many tube sites. Its player
emits `<source>` tags with URLs of the form
`<host>/get_file/<bucket_id>/<token>/<X>/<scene_id>/<scene_id>[_<quality>p].mp4/`.
The token is IP-bound (signed for the VPS that fetched the embed).

Per-tube differences:
- pornhat: get_file 302 → HLS m3u8 manifest. Type='m3u8'.
- freshporno: get_file 302 → direct mp4 CDN (remote_control.php?...&file=...). Type='mp4'.

The generic `_embed_iframe.extract` Stage 0.5 picks up ALL mp4 URLs from the page HTML,
including `_preview*.mp4` entries from the sidebar of suggested videos (different scene
IDs with a different token → 404 on fetch), plus duplicate `<source>` tags (multi-CDN
load balancing).
Here we extract only the `<source>` tags from the player area, filter out `_preview`
URLs, and dedupe by basename, so that the user does not see 18 entries in the quality
modal for a single scene.
"""
from __future__ import annotations
import logging
import re
from urllib.parse import urlparse
from app.extractors._fetch import fetch_tube_html
from app.extractors._models import StreamSource
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Matches a `<source ...>` tag whose double-quoted `src` attribute is an
# http(s) URL that contains `/get_file/` and ends in `.mp4`, optionally
# followed by a trailing slash. Matching is case-insensitive and the URL is
# captured in the named group `url`. Single-quoted `src` values do not match.
_SOURCE_RE = re.compile(
r'<source[^>]+src="(?P<url>https?://[^"]+/get_file/[^"]+\.mp4/?)"',
re.IGNORECASE,
)
# Captures the quality label (3-4 digits followed by `p`, e.g. `720p`) from a
# `_<quality>.mp4` file name into the named group `q`. The `.mp4` must be
# followed by `/`, `?`, or the end of the string.
_QUALITY_RE = re.compile(r"_(?P<q>\d{3,4}p)\.mp4(?:/|$|\?)")
def extract_kvs_sources(
    page_url: str,
    *,
    stream_type: str = "mp4",
    timeout: float = 60.0,
    log_tag: str = "kvs",
) -> list[StreamSource] | None:
    """Extract the `<source>` URLs of a KVS page, deduplicated and without previews.

    The page HTML is fetched with `fetch_tube_html`, every URL matched by
    `_SOURCE_RE` is collected, URLs whose path contains `_preview` are
    dropped, and the remainder is deduplicated by the last non-empty path
    segment (the first occurrence wins).

    Args:
        page_url: URL of the KVS page to fetch.
        stream_type: Value stored as `type` on every returned source:
            'mp4' (freshporno direct mp4) or 'm3u8' (pornhat HLS manifest).
        timeout: Forwarded unchanged to `fetch_tube_html` (presumably
            seconds; confirm against that helper).
        log_tag: Prefix for log lines (tube name).

    Returns:
        The sources ordered from highest to lowest quality, with sources that
        have no parseable quality placed last; `None` when the page has no
        matching `<source>` tag or every match is a `_preview` trailer.
    """

    def _height(source: StreamSource) -> int:
        # Numeric sort rank: "720p" -> 720; missing or malformed labels -> -1.
        label = source.quality
        if not label:
            return -1
        try:
            return int(label.rstrip("p"))
        except ValueError:
            return -1

    html = fetch_tube_html(page_url, timeout=timeout)
    candidates = _SOURCE_RE.findall(html)
    if not candidates:
        log.info("%s: no <source> tags found on %s", log_tag, page_url)
        return None

    # Keep (url, path) pairs so each URL is parsed only once.
    playable: list[tuple[str, str]] = []
    for url in candidates:
        path = urlparse(url).path
        if "_preview" not in path:
            playable.append((url, path))
    if not playable:
        log.info("%s: all sources were _preview trailers on %s", log_tag, page_url)
        return None

    # Dicts preserve insertion order, so this keeps the first URL seen for
    # each file name and drops later copies of the same file name.
    by_basename: dict[str, StreamSource] = {}
    for url, path in playable:
        # Last non-empty path segment; fall back to the whole URL if the
        # path has no segments at all.
        basename = path.rstrip("/").rpartition("/")[2] or url
        if basename in by_basename:
            continue
        found = _QUALITY_RE.search(url)
        by_basename[basename] = StreamSource(
            link=url,
            type=stream_type,
            quality=found.group("q") if found else None,
        )

    return sorted(by_basename.values(), key=_height, reverse=True)