feat(extractors): detect deleted porntrex videos and mark dead

Porntrex soft-deletes: a removed video returns HTTP 200 with a "this video was deleted"
message instead of a player, so extract returned [] (transient) and the source was never
marked dead, leaving users on a permanently broken link (report 75dbf53e). Match the
deletion message and raise HosterDead so resolve marks the source dead.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-13 19:04:10 +02:00
parent 9d4384cef3
commit 32919d6a6c

View file

@ -30,12 +30,22 @@ import re
import time import time
from app.extractors._fetch import _DEFAULT_IMPERSONATE, _DEFAULT_UA, _HAS_CURL_CFFI, fetch_tube_html from app.extractors._fetch import _DEFAULT_IMPERSONATE, _DEFAULT_UA, _HAS_CURL_CFFI, fetch_tube_html
from app.extractors._models import StreamSource from app.extractors._models import HosterDead, StreamSource
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
_BASE = "https://www.porntrex.com" _BASE = "https://www.porntrex.com"
# Porntrex soft-deletes videos: a removed video still answers HTTP 200, but the page
# carries a notice (e.g. "this video was deleted per copyright owner request") in
# place of the player. Without this check extract returned [] (treated as transient),
# so the source was never marked dead and users kept clicking a broken link
# (report 75dbf53e). A match makes extract raise HosterDead so resolve marks it dead.
# One known deletion phrase per line; matching is case-insensitive.
_DEAD_RE = re.compile(
    r"this video (?:was|has been) deleted"
    r"|video (?:was|has been) removed"
    r"|no longer available"
    r"|video is unavailable",
    flags=re.IGNORECASE,
)
# flashvars: `video_url: 'https://.../get_file/...mp4/'` + `video_url_text: '480p'`. # flashvars: `video_url: 'https://.../get_file/...mp4/'` + `video_url_text: '480p'`.
# Warianty: video_url, video_alt_url, video_alt_url2, video_alt_url3... # Warianty: video_url, video_alt_url, video_alt_url2, video_alt_url3...
_URL_RE = re.compile( _URL_RE = re.compile(
@ -107,6 +117,11 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
else: else:
html = fetch_tube_html(page_url, timeout=timeout) html = fetch_tube_html(page_url, timeout=timeout)
# Soft-delete: the page is alive (HTTP 200) but the video was removed → mark dead (not transient).
# Only when html is non-empty (empty = fetch failure = transient, NOT dead).
if html and _DEAD_RE.search(html):
raise HosterDead(f"porntrex {page_url}: video deleted/removed")
# Mapa <var_name> → quality label (np. video_alt_url → "720p HD"). # Mapa <var_name> → quality label (np. video_alt_url → "720p HD").
quality_by_var: dict[str, str] = {} quality_by_var: dict[str, str] = {}
for m in _TEXT_RE.finditer(html): for m in _TEXT_RE.finditer(html):