Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
83 lines
2.8 KiB
Python
83 lines
2.8 KiB
Python
"""Shared KVS engine extractor: `<source src="...get_file/.../<scene_id>_<quality>.mp4/">`.
|
|
|
|
KVS (Kernel Video Sharing) to commercial CMS używany przez wiele tube'ów. Player
|
|
emituje `<source>` tagi z URL `<host>/get_file/<bucket_id>/<token>/<X>/<scene_id>/<scene_id>[_<quality>p].mp4/`.
|
|
Token jest IP-bound (signed dla VPS który fetchował embed).
|
|
|
|
Różnice per-tube:
|
|
- pornhat: get_file 302 → HLS m3u8 manifest. Type='m3u8'.
|
|
- freshporno: get_file 302 → direct mp4 CDN (remote_control.php?...&file=...). Type='mp4'.
|
|
|
|
Generic `_embed_iframe.extract` Stage 0.5 łapie WSZYSTKIE mp4 URLs z page HTML —
|
|
w tym `_preview*.mp4` z sidebar suggested videos (różne scene IDs z innym tokenem
|
|
→ 404 po fetch). Plus duplikaty `<source>` (multi-CDN load balancing).
|
|
|
|
Tu wyciągamy tylko `<source>` z player area, filtrujemy `_preview` URL-e, dedupe
|
|
po basename — żeby user nie widział 18 entries quality modal dla jednej sceny.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
from app.extractors._fetch import fetch_tube_html
|
|
from app.extractors._models import StreamSource
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Matches a `<source ...>` tag whose double-quoted `src` is an http(s) URL that
# goes through `/get_file/` and ends in `.mp4`, optionally followed by one `/`.
# The URL is exposed as the named group `url`; matching is case-insensitive.
# Because the pattern has a single group, `findall` returns plain URL strings.
_SOURCE_RE = re.compile(
    r'<source[^>]+src="(?P<url>https?://[^"]+/get_file/[^"]+\.mp4/?)"',
    flags=re.I,
)

# Captures the quality label (3-4 digits plus `p`, e.g. `720p`) that sits
# between `_` and `.mp4`; the `.mp4` must be followed by `/`, `?` or the end
# of the string. The label is exposed as the named group `q`.
_QUALITY_RE = re.compile(
    r"_(?P<q>\d{3,4}p)\.mp4(?:/|$|\?)",
)
|
|
|
|
|
|
def extract_kvs_sources(
    page_url: str,
    *,
    stream_type: str = "mp4",
    timeout: float = 60.0,
    log_tag: str = "kvs",
) -> list[StreamSource] | None:
    """Extract `<source>` URLs from a KVS page, deduplicated and without preview trailers.

    Args:
        page_url: URL of the page; passed to `fetch_tube_html`.
        stream_type: 'mp4' (freshporno direct mp4) or 'm3u8' (pornhat HLS manifest);
            stored unchanged as the `type` of every returned source.
        timeout: forwarded to `fetch_tube_html`.
        log_tag: prefix for log lines (tube name).

    Returns:
        Sources ordered from highest to lowest numeric quality, with entries
        whose quality is unknown placed last; `None` when no `<source>` tag
        matched or every match was a `_preview` URL.
    """
    html = fetch_tube_html(page_url, timeout=timeout)

    candidates = _SOURCE_RE.findall(html)
    if not candidates:
        log.info("%s: no <source> tags found on %s", log_tag, page_url)
        return None

    # Drop preview trailers; only the URL path is inspected for the marker.
    playable = [link for link in candidates if "_preview" not in urlparse(link).path]
    if not playable:
        log.info("%s: all sources were _preview trailers on %s", log_tag, page_url)
        return None

    # One entry per file name (last non-empty path segment). The dict keeps
    # insertion order and `setdefault` never overwrites, so the first URL seen
    # for each file name wins.
    first_by_name: dict[str, str] = {}
    for link in playable:
        segments = [seg for seg in urlparse(link).path.split("/") if seg]
        dedupe_key = segments[-1] if segments else link
        first_by_name.setdefault(dedupe_key, link)

    streams: list[StreamSource] = []
    for link in first_by_name.values():
        found = _QUALITY_RE.search(link)
        label = found.group("q") if found else None
        streams.append(StreamSource(link=link, type=stream_type, quality=label))

    def _height(stream: StreamSource) -> int:
        # Numeric part of a label such as "720p"; -1 when missing or unparsable
        # so those entries sort after every known quality.
        label = stream.quality
        if label:
            try:
                return int(label.rstrip("p"))
            except ValueError:
                pass
        return -1

    # Stable sort: entries with equal quality keep their page order.
    return sorted(streams, key=_height, reverse=True)
|