fix(extractor/hqporner): wire dedicated extractor + reject ad iframes/CDNs
Registry mapowanie `hqpornercom` -> `_vps_blocked_fallback.extract` zwracało scene page URL do mobile WebView. Page ma 3 ad-iframes (adtng/goaserv/mavrtracktor) + pop-under triggery -> user widział reklamę zamiast video. Powrót do `hqporner.extract` (multi-quality bigcdn.cc mp4 + force_proxy=True). Plus hardening: iframe regex bound do `<div id="playerWrapper">...</div>`, whitelist hostów embed (mydaddy.cc/hqwo.cc) i CDN mp4 (bigcdn/hqwo/flyflv).
This commit is contained in:
parent
7979d5fa61
commit
aac6b10d77
2 changed files with 58 additions and 14 deletions
|
|
@ -51,14 +51,17 @@ log = logging.getLogger(__name__)
|
||||||
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
|
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
|
||||||
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
|
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
|
||||||
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# hqporner — CDN URLs IP-bound do VPS, force_proxy wymusza ruch przez VPS proxy.
|
# hqporner — dedicated extractor zwraca multi-quality `<source>` mp4 URLs
|
||||||
# 2026-05-20 (pre-public): bandwidth + anonimowość VPS > UX. Switch na WebView
|
# (bigcdn.cc / hqwo.cc / flyflv) z `force_proxy=True`. CDN URLs IP-bound do
|
||||||
# fallback — mobile pobiera embed iframe z phone IP, FluidPlayer JS decoduje
|
# VPS, więc playback.py routuje przez proxy — mobile dostaje quality picker
|
||||||
# mp4, ExoPlayer odtwarza direct z phone CDN session. **0 VPS bandwidth + VPS
|
# + natywny ExoPlayer, bez WebView.
|
||||||
# IP nie ujawniony** (mobile nie łączy się z VPS proxy URL).
|
# Bug-report e8ddd8d4: WebView fallback (`_vps_blocked_fallback`) ładował
|
||||||
# Trade-off: WebView ma 1 extra step (page → player JS) ale bez popup-ads jak
|
# hqporner.com scene page w WebView, ale ta strona ma ad-iframes (adtng,
|
||||||
# hqporner.com bo INJECTED_JS w PlayerScreen.tsx blokuje + scrape `<source>.src`.
|
# goaserv, mavrtracktor) + pop-under-triggery → user klikał i widział
|
||||||
"hqpornercom": _vps_blocked_fallback.extract,
|
# reklamę zamiast video. INJECTED_JS w PlayerScreen.tsx nie chwytał
|
||||||
|
# popupów dość szybko. Powrót do natywnego = `<source>` mp4 picker omija
|
||||||
|
# tę ścieżkę całkowicie.
|
||||||
|
"hqpornercom": hqporner.extract,
|
||||||
"epornercom": eporner.extract,
|
"epornercom": eporner.extract,
|
||||||
"sxyprncom": sxyprn.extract,
|
"sxyprncom": sxyprn.extract,
|
||||||
# Mainstream tubes — yt-dlp
|
# Mainstream tubes — yt-dlp
|
||||||
|
|
|
||||||
|
|
@ -32,10 +32,25 @@ from app.extractors.hoster import extract_stream_from_hoster
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
_IFRAME_RE = re.compile(
|
# Wyciągamy zawartość `<div id="playerWrapper">…</div>` osobno, potem szukamy
|
||||||
r'<div[^>]+id=["\']?playerWrapper["\']?[^>]*>.*?<iframe[^>]+src=["\']([^"\']+)',
|
# `<iframe>` TYLKO wewnątrz. Wcześniej regex `playerWrapper>.*?<iframe` z DOTALL
|
||||||
|
# przelatywał przez pusty/JS-loaded wrapper i łapał kolejny `<iframe>` w
|
||||||
|
# dokumencie — a hqporner ma 2-3 ad-iframes (adtng/goaserv/mavrtracktor) wokół
|
||||||
|
# playera, więc trafialiśmy w reklamę zamiast w mydaddy.cc/hqwo.cc.
|
||||||
|
_PLAYER_WRAPPER_RE = re.compile(
|
||||||
|
r'<div[^>]+id=["\']?playerWrapper["\']?[^>]*>(.*?)</div>',
|
||||||
re.IGNORECASE | re.DOTALL,
|
re.IGNORECASE | re.DOTALL,
|
||||||
)
|
)
|
||||||
|
_PLAYER_IFRAME_RE = re.compile(r'<iframe[^>]+src=["\']([^"\']+)', re.IGNORECASE)
|
||||||
|
|
||||||
|
# Whitelist hostów embed iframe'a. Hqporner rotuje między mydaddy.cc i hqwo.cc
|
||||||
|
# (zmiany typowo co kilka miesięcy). Wszystko inne (adtng, goaserv, mavrtracktor,
|
||||||
|
# smartpop, popcash, reebr) → reklama. Brak match = fail safe (return None),
|
||||||
|
# nie próbujemy go odpalić jako hostera bo to ad-redirect → pop-under.
|
||||||
|
_VIDEO_IFRAME_HOST_RE = re.compile(
|
||||||
|
r"//(?:[a-z0-9-]+\.)?(?:mydaddy|hqwo|hqporner)\.[a-z]{2,4}/",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
# Match `<source src="...mp4" title="...">` z opcjonalnym title. Po unescape
|
# Match `<source src="...mp4" title="...">` z opcjonalnym title. Po unescape
|
||||||
# (`\"` → `"`) ten regex łapie zarówno raw HTML (mydaddy.cc) jak i JS-embedded
|
# (`\"` → `"`) ten regex łapie zarówno raw HTML (mydaddy.cc) jak i JS-embedded
|
||||||
|
|
@ -45,19 +60,36 @@ _SOURCE_RE = re.compile(
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Whitelist CDN-ów mp4. Real video URLs lecą z bigcdn.cc (s12./s68./...),
|
||||||
|
# hqwo.cc/pubs, flyflv. Wszystko spoza listy w `<source>` tagu = pre-roll /
|
||||||
|
# interstitial / ad injection (hipoteza z bug-reportu: hqporner zaczął
|
||||||
|
# wrzucać ad mp4 URLs do `<source>` w 2026).
|
||||||
|
_VIDEO_CDN_HOST_RE = re.compile(
|
||||||
|
r"//(?:[a-z0-9-]+\.)?(?:bigcdn|hqwo|flyflv|hqwallcdn)\.[a-z]{2,4}/",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
|
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
|
||||||
page_html = fetch_tube_html(page_url, timeout=timeout)
|
page_html = fetch_tube_html(page_url, timeout=timeout)
|
||||||
m = _IFRAME_RE.search(page_html)
|
wrapper_m = _PLAYER_WRAPPER_RE.search(page_html)
|
||||||
if not m:
|
if not wrapper_m:
|
||||||
log.warning("hqporner: no iframe in %s", page_url)
|
log.warning("hqporner: no playerWrapper div in %s", page_url)
|
||||||
return None
|
return None
|
||||||
iframe_src = m.group(1).strip()
|
iframe_m = _PLAYER_IFRAME_RE.search(wrapper_m.group(1))
|
||||||
|
if not iframe_m:
|
||||||
|
log.warning("hqporner: no iframe inside playerWrapper for %s", page_url)
|
||||||
|
return None
|
||||||
|
iframe_src = iframe_m.group(1).strip()
|
||||||
if iframe_src.startswith("//"):
|
if iframe_src.startswith("//"):
|
||||||
iframe_src = "https:" + iframe_src
|
iframe_src = "https:" + iframe_src
|
||||||
elif iframe_src.startswith("/"):
|
elif iframe_src.startswith("/"):
|
||||||
iframe_src = f"https://hqporner.com{iframe_src}"
|
iframe_src = f"https://hqporner.com{iframe_src}"
|
||||||
|
|
||||||
|
if not _VIDEO_IFRAME_HOST_RE.search(iframe_src):
|
||||||
|
log.warning("hqporner: iframe host not whitelisted (likely ad): %s", iframe_src)
|
||||||
|
return None
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": _DEFAULT_UA,
|
"User-Agent": _DEFAULT_UA,
|
||||||
"Accept": "text/html,application/xhtml+xml",
|
"Accept": "text/html,application/xhtml+xml",
|
||||||
|
|
@ -94,6 +126,13 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
|
||||||
if url in seen_urls:
|
if url in seen_urls:
|
||||||
continue
|
continue
|
||||||
seen_urls.add(url)
|
seen_urls.add(url)
|
||||||
|
# Drop `<source>` URLs spoza znanych CDN-ów. Jeśli hqporner wstrzyknie
|
||||||
|
# `<source src="//ads.example.com/preroll.mp4">` (hipoteza z bug-reportu)
|
||||||
|
# — bez whitelist'a quality picker w mobile mógłby wystrzelić mu URL
|
||||||
|
# reklamy zamiast 1080p mp4.
|
||||||
|
if not _VIDEO_CDN_HOST_RE.search(url):
|
||||||
|
log.info("hqporner: skip non-CDN source URL: %s", url)
|
||||||
|
continue
|
||||||
title = (sm.group(2) or "").strip()
|
title = (sm.group(2) or "").strip()
|
||||||
# `force_proxy=True` (2026-05-20): CDN-y bigcdn.cc/flyflv IP-bound + flyflv ma
|
# `force_proxy=True` (2026-05-20): CDN-y bigcdn.cc/flyflv IP-bound + flyflv ma
|
||||||
# `ip=46.62.219.154` w URL path. Mobile direct = 404/403 → fallback proxy
|
# `ip=46.62.219.154` w URL path. Mobile direct = 404/403 → fallback proxy
|
||||||
|
|
@ -118,5 +157,7 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
|
||||||
|
|
||||||
# Fallback 2: oddaj iframe URL jako hoster type — mobile otworzy w WebView,
|
# Fallback 2: oddaj iframe URL jako hoster type — mobile otworzy w WebView,
|
||||||
# FluidPlayer JS sam wyciągnie URL po user click / przejściu adblock check.
|
# FluidPlayer JS sam wyciągnie URL po user click / przejściu adblock check.
|
||||||
|
# Iframe_src ma już zwalidowany host whitelist (mydaddy.cc/hqwo.cc), więc
|
||||||
|
# WebView nie wpadnie w ad-domain redirect.
|
||||||
log.info("hqporner: using hoster fallback for %s", iframe_src)
|
log.info("hqporner: using hoster fallback for %s", iframe_src)
|
||||||
return [StreamSource(link=iframe_src, type="hoster")]
|
return [StreamSource(link=iframe_src, type="hoster")]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue