"""Streamtape embed → direct mp4 extractor. Pattern (verified 2026-05-15 z residential, live URL `/e/PZqBZp4OomF0Q61`): 1. Embed `/e/` zwraca 89KB body z 4 `document.getElementById(...).innerHTML` assignmentami konstruującymi pełen URL do `/get_video`. Każdy uses ten sam pattern: document.getElementById('robotlink').innerHTML = '//streamtape.com/get_video' + ('?id=&expires=...&ip=...&token=...').substring(N).substring(M); Junk to 3-4 znaki przed `?` — substring(N).substring(M) je odcina. 2. Po sklejeniu fetch `https://streamtape.com/get_video?id=...&token=...` → 302 → `https://.tapecontent.net/radosgw///.mp4` (direct mp4, video/mp4 ~500MB, brak IP-bind). 3. Body czasem zwraca `Video not found! Maybe it got deleted by the creator!` — większość URLów w naszej DB (12k mass-DMCA'd 2026-05-15). Wtedy raise HosterDead, caller w playback.py oznaczy dead_at. Live URL coverage probed 2026-05-15: ~5% URLów żyje, reszta `Video not found`. """ from __future__ import annotations import logging import re from app.extractors._fetch import _DEFAULT_UA, browser_get from app.extractors._models import HosterDead, StreamSource log = logging.getLogger(__name__) # Match: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);` # Streamtape generuje 4 assignmenty (ideoolink x2 + botlink + robotlink) — 2 są DECOYs # z połamanym hostname (`.comb`, `.cob`) i tylko botlink/robotlink dają prawdziwy URL. # Prefix może być fragmentem: `/streamtape.com`, `//streamtape.co`, `//streamtape.com/g` # — `get_video` często jest split między prefix i suffix po slice'ach. Decyzja na # podstawie KOMBINOWANEGO output containing exact `streamtape.com/get_video?`. _ASSIGN_RE = re.compile( r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*" r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]" r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?" r"\(['\"](?P<suffix>[^'\"]+)['\"]\)" r"(?P<slices>(?:\.substring\(\d+\))+)", re.IGNORECASE, ) _SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)") _NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE) def _apply_slices(suffix: str, slices_str: str) -> str: out = suffix for m in _SUBSTRING_RE.finditer(slices_str): n = int(m.group(1)) out = out[n:] return out def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None: headers = { "User-Agent": _DEFAULT_UA, "Accept": "text/html,application/xhtml+xml", "Accept-Language": "en-US,en;q=0.9", } r = browser_get(page_url, headers=headers, timeout=timeout) if r.status_code in (404, 410): raise HosterDead(f"streamtape {page_url}: HTTP {r.status_code}") if r.status_code != 200 or not r.text: log.info("streamtape: fetch fail %s status=%s", page_url, r.status_code) return None if _NOT_FOUND_RE.search(r.text): raise HosterDead(f"streamtape {page_url}: Video not found") # Spróbuj wszystkie 4 assignmenty — pierwszy poprawny URL wygrywa. # `get_video` może być w prefix (residential variant) lub split prefix+suffix # (VPS variant gdzie decoy assignmenty produkują `.comb/get_video`). final_url: str | None = None for m in _ASSIGN_RE.finditer(r.text): prefix = m.group("prefix").strip() suffix = m.group("suffix") slices = m.group("slices") tail = _apply_slices(suffix, slices) combined = prefix + tail # Normalize: dodaj `https:` jeśli URL zaczyna się od `//` if combined.startswith("//"): url = "https:" + combined elif combined.startswith("/"): url = "https:/" + combined # `/streamtape.com/...` → `https://streamtape.com/...` else: url = combined # Walidacja — odsiewa decoys (`streamtape.comb`, `streamtape.cob`). if ( "streamtape.com/get_video?" in url and "id=" in url and "token=" in url ): final_url = url break if not final_url: log.info("streamtape: no valid innerHTML assignment found in %s", page_url) return None return [ StreamSource( link=final_url, quality=None, type="mp4", referer=page_url, # /get_video zwraca 302 do tapecontent.net direct mp4. Proxy musi # follow redirect (stream_proxy domyślnie follow_redirects=True). raw={"redirect_via": "streamtape_get_video"}, ) ]