goon/app/extractors/tubes/porntrex.py
jtrzupek 32919d6a6c feat(extractors): detect deleted porntrex videos and mark dead
Porntrex soft-deletes: a removed video returns HTTP 200 with a "this video was deleted"
message instead of a player, so extract returned [] (transient) and the source was never
marked dead, leaving users on a permanently broken link (report 75dbf53e). Match the
deletion message and raise HosterDead so resolve marks the source dead.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-13 19:04:10 +02:00

163 lines
6.8 KiB
Python

"""porntrex.com — KVS engine direct stream extractor.
2026-05-22: the Hetzner VPS IP can reach porntrex again (HTTP 200) — previously a block
kept `porntrexcom` on `_vps_blocked_fallback`. See [[goon_porntrex_vps_unblocked]].
KVS player: the detail page has `flashvars` with `video_url` / `video_alt_url` / `video_alt_url2`
(480p / 720p / 1080p), each of which is a `get_file/<srv>/<token>/<path>.mp4/` URL.
`get_file` 302 → `cdn.pcdn.cloudswitches.com/...mp4?expires=<ts>&md5=<sig>` — this is a
**time-bound signed URL** (not IP-bound, NOT cookie-bound) → once resolved it is
portable: mobile plays directly from the CDN, zero VPS bandwidth.
REVISION 2026-05-31 (user bug report "porntrex slow + no quality choice + probably proxied"):
The earlier assumption "mobile will consume get_file itself" was WRONG — the `get_file` token is
**cookie/session-bound**: it works only in the same curl_cffi session that fetched the page.
A separate mobile request (ExoPlayer, no cookies) → 410 → mobile fell back to the VPS proxy
(hence the flicker = nav.replace + slow playback). Verified: a same-session follow of
get_file → 200 (streams video); fresh session → 410. The final CDN url (cloudswitches,
expires+md5), by contrast, is portable (fresh session → 206).
FIX: resolve the 302 ON THE BACKEND (in the same session as the page fetch) and return the
**final CDN url** per quality. Mobile plays directly, the multi-quality picker works, zero proxy.
The get_file token is consumed once here; the CDN url is time-bound (not single-use) → it lasts
for the playback session.
"""
from __future__ import annotations
import logging
import re
import time
from app.extractors._fetch import _DEFAULT_IMPERSONATE, _DEFAULT_UA, _HAS_CURL_CFFI, fetch_tube_html
from app.extractors._models import HosterDead, StreamSource
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Site origin; also used to build the Referer sent with stream requests.
_BASE = "https://www.porntrex.com"
# Porntrex soft-delete: a removed video returns HTTP 200 with a page carrying a message
# (e.g. "this video was deleted per copyright owner request") instead of the player.
# Without this pattern extract found no sources (treated as transient), so the source
# was NEVER marked dead and the user kept clicking a dead link (report 75dbf53e).
# Match → raise HosterDead → resolve marks the source dead.
# NOTE(review): alternatives such as "no longer available" are searched in the whole
# page HTML — confirm they cannot appear on pages of videos that are still playable.
_DEAD_RE = re.compile(
r"this video (?:was|has been) deleted|video (?:was|has been) removed"
r"|no longer available|video is unavailable",
re.IGNORECASE,
)
# flashvars: `video_url: 'https://.../get_file/...mp4/'` + `video_url_text: '480p'`.
# Variants: video_url, video_alt_url, video_alt_url2, video_alt_url3...
# Group 1 is the flashvars variable name, group 2 the get_file URL.
_URL_RE = re.compile(
r"(video(?:_alt)?_url\d*)\s*:\s*'(https?://[^']+/get_file/[^']+)'",
re.IGNORECASE,
)
# Matches the `<var>_text` companion entries: group 1 is the variable name (without
# the `_text` suffix), group 2 the quality label text.
_TEXT_RE = re.compile(
r"(video(?:_alt)?_url\d*)_text\s*:\s*'([^']*)'",
re.IGNORECASE,
)
def _quality_rank(label: str | None) -> int:
"""`1080p` → 1080, `720p HD` → 720. Do sortowania malejąco."""
if not label:
return -1
m = re.search(r"(\d{3,4})\s*p", label, re.IGNORECASE)
return int(m.group(1)) if m else -1
def _resolve_get_file(session, get_file_url: str, timeout: float) -> str | None:
    """Follow the get_file redirect and return the URL the request finally landed on.

    The request goes through `session` (the one that fetched the page, so it carries
    the page's cookies) with a `rnd=<ms timestamp>` cache-buster appended, like
    kt_player does. `stream=True` plus a two-byte `Range` means only the headers and
    the post-redirect URL are needed; the response is closed without reading the body.

    Returns None when the request raises, when the final status is >= 400, or when
    the final URL still contains `/get_file/` (i.e. no redirect to the CDN happened).
    """
    joiner = "?" if "?" not in get_file_url else "&"
    cache_bust = int(time.time() * 1000)
    probe_url = f"{get_file_url}{joiner}rnd={cache_bust}"
    request_headers = {"Referer": _BASE + "/", "Range": "bytes=0-1"}

    try:
        response = session.get(
            probe_url,
            timeout=timeout,
            allow_redirects=True,
            stream=True,
            headers=request_headers,
        )
        landed_url = str(response.url)
        status_code = response.status_code
        response.close()
    except Exception as exc:
        log.info("porntrex: get_file resolve failed (%s): %s", get_file_url[:60], exc)
        return None

    resolved_ok = status_code < 400 and "/get_file/" not in landed_url
    if resolved_ok:
        return landed_url

    log.info("porntrex: get_file resolve bad status=%s final=%s", status_code, landed_url[:70])
    return None
def _open_session():
    """Return a new curl_cffi session, or None when curl_cffi is not available."""
    if not _HAS_CURL_CFFI:
        return None
    from curl_cffi import requests as _cf_requests

    return _cf_requests.Session(impersonate=_DEFAULT_IMPERSONATE)


def _fetch_page_html(session, page_url: str, timeout: float) -> str:
    """Fetch the detail page through `session`; return "" on HTTP >= 400 or any error."""
    try:
        resp = session.get(
            page_url,
            headers={"User-Agent": _DEFAULT_UA, "Accept": "text/html,application/xhtml+xml"},
            timeout=timeout,
            allow_redirects=True,
        )
        return resp.text if resp.status_code < 400 else ""
    except Exception as e:
        log.info("porntrex: page fetch failed %s: %s", page_url, e)
        return ""


def _close_quietly(session) -> None:
    """Close `session` if there is one; cleanup must never change extract's outcome."""
    if session is None:
        return
    try:
        session.close()
    except Exception as e:
        log.info("porntrex: session close failed: %s", e)


def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract direct mp4 sources from a porntrex detail page, best quality first.

    The page's flashvars carry one get_file URL per quality. The get_file token is
    cookie/session-bound, so each URL is resolved to its final CDN URL in the same
    curl_cffi session that fetched the page. When that session could not fetch the
    page (or curl_cffi is missing) the HTML comes from `fetch_tube_html` and the raw
    get_file URLs are returned unresolved as a fallback.

    Args:
        page_url: URL of the video detail page.
        timeout: Timeout in seconds passed to every HTTP request made here.

    Returns:
        Sources sorted by descending quality, or None when the page has no get_file
        URL in its flashvars (including a failed/empty fetch, which is transient).

    Raises:
        HosterDead: The page has no player URLs and carries a deletion message.
    """
    session = _open_session()
    try:
        html = _fetch_page_html(session, page_url, timeout) if session is not None else ""
        # Only resolve get_file in `session` when that same session fetched the page.
        resolver = session if html else None
        if not html:
            # NOTE(review): fetch_tube_html's failure value is not visible here;
            # a falsy result is normalised to "" so the regex scans below are safe.
            html = fetch_tube_html(page_url, timeout=timeout) or ""

        url_matches = list(_URL_RE.finditer(html))
        if not url_matches:
            # Soft-delete: the page is alive (200) but shows a deletion message instead
            # of a player → mark dead. Checked only when no player URL exists, so a
            # playable page that merely contains such a phrase is not marked dead.
            # Empty html (failed fetch) never matches → stays transient.
            if _DEAD_RE.search(html):
                raise HosterDead(f"porntrex {page_url}: video deleted/removed")
            log.info("porntrex: no KVS video_url in flashvars on %s", page_url)
            return None

        # Map <var_name> → quality label (e.g. video_alt_url → "720p HD").
        quality_by_var: dict[str, str] = {
            m.group(1).lower(): m.group(2).strip() for m in _TEXT_RE.finditer(html)
        }

        seen: set[str] = set()
        result: list[StreamSource] = []
        for m in url_matches:
            url = m.group(2)
            if url in seen:
                continue
            seen.add(url)
            quality = quality_by_var.get(m.group(1).lower())
            # Resolve get_file → portable CDN url (in the page's session). If resolving
            # fails, hand out the get_file URL as a fallback (mobile tries direct and
            # may end up on the proxy).
            final_link = url
            if resolver is not None:
                final_link = _resolve_get_file(resolver, url, timeout) or url
            result.append(
                StreamSource(
                    link=final_link,
                    type="mp4",
                    quality=quality or None,
                    referer=_BASE + "/",
                    # The final CDN url (cloudswitches) is time-bound (expires+md5),
                    # not cookie/IP-bound → mobile plays direct, zero VPS proxy bandwidth.
                    raw={"mobile_direct_ok": True},
                )
            )

        result.sort(key=lambda s: _quality_rank(s.quality), reverse=True)
        return result
    finally:
        # The session was previously never closed (and its reference was dropped on
        # the fallback path); release it on every exit, including HosterDead.
        _close_quietly(session)