Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
343 lines
14 KiB
Python
343 lines
14 KiB
Python
"""Generic hoster (StreamWish/doodporn/mixdrop/filemoon/luluvdo) stream URL extractor.
|
||
|
||
Hoster embed pages use JWPlayer with P.A.C.K.E.R. obfuscation:
    eval(function(p,a,c,k,e,d){...}('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'),...))
and hide `sources: [{file: "https://...m3u8"}]` inside the packed JS.

This module provides:
- `unpack_packer(js)` — P.A.C.K.E.R. decoder.
- `extract_stream_from_hoster(iframe_url, *, referer)` — fetch embed → unpack → m3u8/mp4

These functions are used by:
1. Per-tube extractors (latestpornvideo, hqporner fallback) — page → embed iframe → here
2. Movies playback (api/playback.py movies_router) — direct hoster URL → here

There is no longer any dependency on PornAppClient / the porn-app API.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import re
|
||
|
||
from app.extractors._fetch import _DEFAULT_UA, browser_get
|
||
from app.extractors._models import HosterDead
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
|
||
# P.A.C.K.E.R. JavaScript unpacker — reverses obfuscation of the form:
#   eval(function(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(...);return p}
#        ('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'), 0, {}))
# StreamWish, doodporn, mixdrop, filemoon — all of them use this packer to hide
# `sources: [{file: "https://...m3u8"}]` inside the JWPlayer config.
#
# Capture groups: 1 = single-quoted payload (may contain \' escapes), 2 = base,
# 3 = keyword count, 4 = the "|"-joined keyword list.
_PACKER_ARGS_RE = re.compile(
    r"\}\s*\(\s*'((?:\\'|[^'])+)'\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*'((?:\\'|[^'])*)'\s*\.split\('\|'\)",
    re.DOTALL,
)
|
||
|
||
|
||
def _base_n(token: str, base: int) -> int | None:
|
||
"""Parsuje token jako liczbę w bazie 'base' (max 62 dla a-zA-Z0-9)."""
|
||
try:
|
||
result = 0
|
||
for ch in token:
|
||
if ch.isdigit():
|
||
d = ord(ch) - ord("0")
|
||
elif "a" <= ch <= "z":
|
||
d = ord(ch) - ord("a") + 10
|
||
elif "A" <= ch <= "Z":
|
||
d = ord(ch) - ord("A") + 36
|
||
else:
|
||
return None
|
||
if d >= base:
|
||
return None
|
||
result = result * base + d
|
||
return result
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def unpack_packer(js: str) -> str | None:
    """Decode the first P.A.C.K.E.R.-obfuscated script found in *js*.

    Args:
        js: Text (JavaScript or a whole HTML page) that may contain a packed
            ``eval(function(p,a,c,k,e,d){...}(...))`` call.

    Returns:
        The payload with every word token that decodes to a valid keyword
        index replaced by that keyword, or ``None`` when the packer argument
        pattern is not present.
    """
    found = _PACKER_ARGS_RE.search(js)
    if found is None:
        return None

    raw_payload, radix_text, count_text, keyword_blob = found.groups()
    radix = int(radix_text)
    words = keyword_blob.split("|")
    # A token is only substituted when its index is below both the declared
    # keyword count and the number of keywords actually present.
    limit = min(int(count_text), len(words))

    # Undo the JS string-literal escaping of the payload, in this fixed order.
    source = raw_payload
    for escaped, plain in (("\\'", "'"), ('\\"', '"'), ("\\\\", "\\")):
        source = source.replace(escaped, plain)

    def substitute(hit: re.Match[str]) -> str:
        word = hit.group(0)
        index = _base_n(word, radix)
        if index is not None and index < limit:
            # An empty keyword slot means the token stands for itself.
            return words[index] or word
        return word

    return re.sub(r"\b\w+\b", substitute, source)
|
||
|
||
|
||
# Structured match for a player-config entry such as `file: "https://..."` or
# `sources = 'https://...'`. Group 1 is the quoted absolute http(s) URL, which must
# contain a .m3u8 / .mp4 / .mpd extension (anything may follow it up to the closing
# quote). Matching is case-insensitive.
_HOSTER_FILE_RE = re.compile(
    r'(?:["\']?file["\']?|sources?)\s*[:=]\s*["\'](https?://[^"\']+\.(?:m3u8|mp4|mpd)[^"\']*)["\']',
    re.IGNORECASE,
)
|
||
|
||
|
||
# Ad-rolls embedded w player config (xtremestream.xyz, niektóre KVS forki).
|
||
# Bez filtra extractor wracał preroll.mp4 jako "scena" → user widział 20s reklamy
|
||
# zamiast filmu (zgłoszone 2026-05-10, bug-report #30c4d3cf perverzija).
|
||
# Pattern obejmuje typowe nazwy ad-rolli + CDN-y które serwują reklamy
|
||
# (opencdn.b-cdn.net to bunnycdn alias dla reklam).
|
||
_AD_VIDEO_RE = re.compile(
|
||
r"/(?:preroll|midroll|postroll|preplay|ads?|advert|promo)\d*\.(?:mp4|m3u8|webm)"
|
||
r"|opencdn\.b-cdn\.net/video/(?:pre|mid|post|ad)",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
|
||
def _looks_like_ad(url: str) -> bool:
|
||
return bool(_AD_VIDEO_RE.search(url))
|
||
|
||
# Some hosters (doodporn) keep the mp4/m3u8 in a dictionary of variables and refer to
# it as `sources: [{file: links.hls2}]`, which the structured regex cannot catch. The
# second pass takes the first `.m3u8|.mp4|.mpd` URL found anywhere in the unpacked
# HTML — a heuristic, but the first such URL is usually the master video playlist.
# The whole match (group 0) is the URL, including an optional query string.
_HOSTER_FALLBACK_URL_RE = re.compile(
    r'https?://[^\s"\'<>]+\.(?:m3u8|mp4|mpd)(?:\?[^\s"\'<>]*)?',
    re.IGNORECASE,
)
|
||
|
||
|
||
# "Video not found" / "deleted" signatures that hosters put into the embed page HTML.
# When one of these markers is present the link is known to be dead: HosterDead is
# raised and the caller in playback.py marks playback_source.dead_at.
# The markers are compared as case-sensitive substrings, hence the spelling variants.
_HOSTER_DEAD_PATTERNS = (
    "Video not found",
    "video not found",
    "Video Not Found",
    "File was deleted",
    "video is deleted",
    "Video is deleted",
    "This video is no longer available",
)
|
||
|
||
|
||
# KVS (Kernel Video Sharing) player markers — kt_player.js + license_code in the HTML.
# Used by fpo.xxx, 0day.kim, hdporn92, sxyland and many other WordPress-based tubes.
# KVS encrypts the `function/0/<encrypted>` URL with license_code, so the regex
# fallback (`_HOSTER_FALLBACK_URL_RE`) would instead pick up the `event_reporting2`
# URL (a tracking pixel that returns a 1×1 GIF rather than video). When the KVS
# markers are present we go straight to yt-dlp, whose generic extractor decrypts the
# URL correctly. A page counts as KVS only when ALL markers are present.
_KVS_MARKERS = ("kt_player(", "license_code")
|
||
|
||
|
||
# File hosters / known dead — rapidgator/nitroflare/frdl require a premium account
# (they return HTML with a login form instead of video). Return None without fetching:
# the caller in movies playback adds an embed-only fallback and the mobile client
# opens a WebView anyway (where the user can log in to premium if they want).
# Streamtape was REMOVED from this blacklist on 2026-05-15 — it has a dedicated
# extractor (innerHTML substring decode → /get_video → 302 → tapecontent.net mp4).
# Most of the 12k URLs in our DB are DMCA-dead, but ~5% are still alive.
_FILE_HOSTER_RE = re.compile(
    r"(?:rapidgator|nitroflare|filer\.net|frdl\.[a-z]+|"
    r"streamcrypt\.net|"
    r"openload\.co|openload\.io|oload\.[a-z]+)",  # openload has been offline since 2019
    re.IGNORECASE,
)
|
||
|
||
|
||
|
||
|
||
def extract_stream_from_hoster(
    iframe_url: str,
    *,
    referer: str,
    timeout: float = 60.0,
) -> str | None:
    """Resolve a hoster embed page to a direct video URL.

    Blacklisted file hosters are rejected without any request. Hosters with a
    dedicated extractor (mixdrop, streamtape, the seekplayer engine family,
    voe) are delegated to it. Everything else goes through the generic path:
    fetch the embed HTML, look for a `file`/`sources` URL, unpack
    P.A.C.K.E.R. JS, scan for any video URL, and finally ask yt-dlp.

    Args:
        iframe_url: URL of the hoster embed page.
        referer: Value sent as the ``Referer`` header of the generic fetch.
        timeout: Timeout in seconds, forwarded to every fetch and extractor.

    Returns:
        The first .m3u8 / .mp4 / .mpd URL found that does not look like an
        ad-roll, or ``None`` when nothing could be extracted.

    Raises:
        HosterDead: The embed page states the video was deleted / not found.
    """
    # Premium-walled file hosters: no point in fetching anything.
    if _FILE_HOSTER_RE.search(iframe_url) is not None:
        log.debug("hoster %s: file-hoster blacklist (premium-walled), skipping", iframe_url)
        return None

    # --- Per-hoster dedicated extractors (specific URL shapes / decode patterns) ---

    # Mixdrop: P.A.C.K.E.R. → MDCore.wurl protocol-relative `//host/v2/<id>.mp4?s=...`
    # — the generic fallback regex `https?://...\.mp4` misses that URL (no scheme).
    if re.search(r"(?:mixdrop|m1xdrop|mxdrop)\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import mixdrop

        mixdrop_sources = mixdrop.extract(iframe_url, timeout=timeout)
        if mixdrop_sources:
            return mixdrop_sources[0].link
        # Deliberately no early return: fall through to the generic logic when
        # the dedicated extractor found nothing.

    # Streamtape: 4 `document.getElementById(...).innerHTML = prefix + (...).substring(N)`
    # assignments, 2 of which are DECOYS with a broken hostname. The dedicated decoder
    # picks the correct one and builds the `/get_video?id=...&token=...` URL.
    if re.search(r"streamtape\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import streamtape

        tape_sources = streamtape.extract(iframe_url, timeout=timeout)
        # streamtape has its own HosterDead handling — the generic fallback would
        # only break here, so the result is final either way.
        return tape_sources[0].link if tape_sources else None

    # Shared SPA+AES-CBC engine: embedseek/seekplayer/rpmplay/upns/player4me/
    # easyvidplayer all use the same engine (`/api/v1/video` with an AES-CBC
    # encrypted m3u8 source). Together ~159k playback sources in the DB.
    from app.extractors.hosters import seekplayer_engine

    if seekplayer_engine.matches(iframe_url):
        seek_sources = seekplayer_engine.extract(iframe_url, timeout=timeout)
        return seek_sources[0].link if seek_sources else None

    # voe.sx: JS redirect to a random mirror → custom 7-step decoder
    # (ROT13 → strip 7 magic seps → atob → -3 shift → reverse → atob → JSON.parse)
    # → HLS m3u8 + mp4 fallback. ~21k movies.
    voe_match = re.search(
        r"//(?:voe\.sx|"
        r"rebeccasciencestreet\.[a-z]+|"
        r"darnobedienceupscale\.[a-z]+|"
        r"[a-z]+upscale\.com|[a-z]+street\.com)/",
        iframe_url,
        re.IGNORECASE,
    )
    if voe_match:
        from app.extractors.hosters import voe

        voe_sources = voe.extract(iframe_url, timeout=timeout)
        return voe_sources[0].link if voe_sources else None

    # --- Generic path: fetch the embed page with browser-like headers ---
    request_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": referer,
    }
    try:
        response = browser_get(
            iframe_url, headers=request_headers, timeout=timeout, follow_redirects=True
        )
        response.raise_for_status()
    except Exception as exc:
        # Any fetch failure (network error, HTTP error status) is treated as
        # "nothing extracted", not as a dead link.
        log.warning("hoster fetch %s failed: %s", iframe_url, exc)
        return None
    page = response.text

    for dead_marker in _HOSTER_DEAD_PATTERNS:
        if dead_marker in page:
            raise HosterDead(f"hoster {iframe_url} reports video deleted/not found")

    def pick_clean(pattern: re.Pattern[str], text: str, group: int = 1) -> str | None:
        """Return the first match of *pattern* in *text* that is not an ad-roll URL."""
        candidates = (hit.group(group) for hit in pattern.finditer(text))
        return next((c for c in candidates if not _looks_like_ad(c)), None)

    # 1) Direct match in the raw HTML (the hoster did not obfuscate).
    direct = pick_clean(_HOSTER_FILE_RE, page, 1)
    if direct:
        return direct

    # KVS player → go straight to yt-dlp to bypass the regex fallback, which would
    # catch the gif-trap URL `event_reporting2`. yt-dlp's generic extractor decrypts
    # `function/0/<enc>` with license_code and returns the real
    # `get_file/<N>/...mp4` URL.
    if all(marker in page for marker in _KVS_MARKERS):
        kvs_url = _try_ytdlp_hoster(iframe_url, timeout=timeout)
        if kvs_url and not _looks_like_ad(kvs_url):
            return kvs_url
        log.warning("hoster %s: KVS markers but yt-dlp failed", iframe_url)
        return None

    # 2) Unpack P.A.C.K.E.R. → match on the unpacked text, first structurally,
    #    then fall back to the first m3u8/mp4 anywhere in the string.
    unpacked = unpack_packer(page)
    if unpacked:
        for pattern, group in ((_HOSTER_FILE_RE, 1), (_HOSTER_FALLBACK_URL_RE, 0)):
            from_packed = pick_clean(pattern, unpacked, group)
            if from_packed:
                return from_packed

    # 3) Fallback on the raw HTML (the URL may live outside the packer).
    loose = pick_clean(_HOSTER_FALLBACK_URL_RE, page, 0)
    if loose:
        return loose

    # 4) yt-dlp as a last resort — battle-tested extractors for streamtape, dood,
    #    mixdrop, filemoon, voe, vidoza, etc. Not used by default (slow + lots of
    #    HTTP), only once our own methods have failed.
    last_resort = _try_ytdlp_hoster(iframe_url, timeout=timeout)
    if last_resort:
        return last_resort

    log.warning(
        "hoster %s: no video URL in embed (packer unpack=%s, yt-dlp fail)",
        iframe_url,
        unpacked is not None,
    )
    return None
|
||
|
||
|
||
def _try_ytdlp_hoster(iframe_url: str, *, timeout: float) -> str | None:
    """Ask yt-dlp to resolve a hoster embed URL to a direct stream URL.

    Used for hosters our own P.A.C.K.E.R. unpacker could not handle. yt-dlp
    ships extractors for the popular hosters (streamtape, dood, mixdrop,
    filemoon, voe, vidoza, streamwish, ...) which do the multi-step AJAX /
    token rotation / regex unpacking each of them needs.

    Error handling is catch-all on purpose: when yt-dlp is not installed, has
    no extractor for the hoster, or anything else fails (timeout, anti-bot
    block, format change), ``None`` is returned and the caller drops to the
    hoster fallback (mobile WebView).

    Args:
        iframe_url: URL of the hoster embed page.
        timeout: Socket timeout in seconds (truncated to an int for yt-dlp).

    Returns:
        The URL of the best-ranked format that looks like a real video stream
        and is not an ad-roll, or ``None``.
    """
    try:
        from yt_dlp import YoutubeDL
    except ImportError:
        return None

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": int(timeout),
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(iframe_url, download=False)
    except Exception as e:
        log.debug("yt-dlp hoster fallback failed for %s: %s", iframe_url, type(e).__name__)
        return None

    if info is None:
        return None

    def _looks_like_video_url(u: str | None) -> bool:
        if not u:
            return False
        if _looks_like_ad(u):
            return False
        low = u.lower()
        # Standard video formats only. yt-dlp's generic extractor sometimes returns
        # the page URL as info["url"] when it did not recognise a stream (e.g. the
        # xtremestream.xyz player without KVS markers). Without this check the
        # extractor returned the iframe URL as the "stream"; the mobile client then
        # tried to play it through ExoPlayer and got a "fake video" or an error
        # (reported 2026-05-10 #30c4d3cf perverzija).
        return any(ext in low for ext in (".m3u8", ".mp4", ".mpd", ".webm", ".ts"))

    # yt-dlp documents `formats` as ordered from WORST to BEST quality, so the list
    # is walked from the end; taking the first entry returned the lowest quality.
    # Single-format extractors have no `formats`, so the info dict itself is used.
    raw_formats = info.get("formats") or [info]
    best_first = [fmt for fmt in list(raw_formats)[::-1] if isinstance(fmt, dict)]

    # Pass 1: best-ranked entry that is not flagged as audio-only / video-only
    # (yt-dlp marks a missing track with the codec name "none"). The player is
    # handed a single URL, so a stream lacking one track is a poor choice.
    for fmt in best_first:
        if fmt.get("vcodec") == "none" or fmt.get("acodec") == "none":
            continue
        url = fmt.get("url")
        if _looks_like_video_url(url):
            return url

    # Pass 2: no such entry — accept any format whose URL looks like video,
    # still best-first, so nothing that was previously returned is lost.
    for fmt in best_first:
        url = fmt.get("url")
        if _looks_like_video_url(url):
            return url

    # Fallback: top-level URL — but only when it really looks like video.
    top = info.get("url")
    if _looks_like_video_url(top):
        return top
    return None
|
||
|
||
|
||
# Public API of this module. HosterDead is imported from app.extractors._models and
# re-exported here.
__all__ = ["extract_stream_from_hoster", "unpack_packer", "HosterDead"]
|