goon/app/extractors/hosters/streamtape.py

"""Streamtape embed → direct mp4 extractor.

Pattern (verified 2026-05-15 from a residential connection, live URL `/e/PZqBZp4OomF0Q61`):

1. The embed page `/e/<id>` returns an 89KB body with 4
   `document.getElementById(...).innerHTML` assignments that build the full
   URL to `/get_video`. Each one uses the same pattern:

       document.getElementById('robotlink').innerHTML =
           '//streamtape.com/get_video' +
           ('<junk>?id=<id>&expires=...&ip=...&token=...').substring(N).substring(M);

   The junk is 3-4 characters before `?` — substring(N).substring(M) cuts them off.

2. Once joined, fetching `https://streamtape.com/get_video?id=...&token=...` →
   302 → `https://<cluster>.tapecontent.net/radosgw/<id>/<signed_path>/<title>.mp4`
   (direct mp4, video/mp4 ~500MB, no IP binding).

3. The body sometimes contains `Video not found! Maybe it got deleted by the creator!`
   — this is the case for most URLs in our DB (12k mass-DMCA'd 2026-05-15). In
   that case raise HosterDead; the caller in playback.py will set dead_at.

Live URL coverage probed 2026-05-15: ~5% of URLs are alive, the rest return `Video not found`.
"""
from __future__ import annotations

import logging
import re
from urllib.parse import parse_qs, urlsplit

from app.extractors._fetch import _DEFAULT_UA, browser_get
from app.extractors._models import HosterDead, StreamSource

log = logging.getLogger(__name__)

# Match: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);`
# Streamtape generates 4 assignments (ideoolink x2 + botlink + robotlink) — 2 of
# them are DECOYS with a broken hostname (`.comb`, `.cob`) and only
# botlink/robotlink yield the real URL.
# The prefix may be a fragment: `/streamtape.com`, `//streamtape.co`,
# `//streamtape.com/g` — `get_video` is often split between prefix and suffix
# once the slices are applied, so validity is decided on the COMBINED output
# (it must point at `streamtape.com/get_video?`), never on prefix or suffix alone.
#
# Named groups: `elem` (element id ending in "link"), `prefix` (quoted string
# containing "streamtape"), `suffix` (quoted string inside the parentheses),
# `slices` (one or more chained `.substring(N)` calls).
_ASSIGN_RE = re.compile(
    r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*"
    r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]"
    r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?"
    r"\(['\"](?P<suffix>[^'\"]+)['\"]\)"
    r"(?P<slices>(?:\.substring\(\d+\))+)",
    re.IGNORECASE,
)
# Captures the numeric argument of every `.substring(N)` call in a slice chain.
_SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)")
# Marker text of the "deleted video" page; case-insensitive and tolerant of
# any whitespace between the words.
_NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE)


def _apply_slices(suffix: str, slices_str: str) -> str:
    """Apply a chain of `.substring(N)` calls to *suffix* and return the rest.

    *slices_str* is the raw `.substring(N).substring(M)...` text captured from
    the page; each call drops the first N characters of the current value, in
    the order the calls appear. A chain with no calls returns *suffix* as is.
    """
    offsets = [int(digits) for digits in _SUBSTRING_RE.findall(slices_str)]
    remainder = suffix
    for offset in offsets:
        # Slicing past the end yields an empty string rather than raising.
        remainder = remainder[offset:]
    return remainder


def _candidate_url(prefix: str, tail: str) -> str | None:
    """Join one assignment's pieces into an absolute URL and validate it.

    Returns the URL only when its host is `streamtape.com` (or a subdomain),
    its path is exactly `/get_video` and its query carries both `id` and
    `token`; returns None for anything else (decoys, malformed values).
    """
    combined = prefix + tail
    if combined.startswith(("http://", "https://")):
        url = combined
    else:
        # Handles `//host/...`, `/host/...` (prefix fragment missing a slash)
        # and a bare `host/...` uniformly, so the result always has a scheme.
        url = "https://" + combined.lstrip("/")

    try:
        parts = urlsplit(url)
    except ValueError:
        # e.g. malformed bracketed host — treat as an invalid candidate.
        return None

    # Check the parsed host instead of searching for a substring: this rejects
    # decoys (`streamtape.comb`, `streamtape.cob`) and also URLs that merely
    # embed `streamtape.com/get_video?` somewhere in their path or query.
    host = parts.hostname or ""
    if host != "streamtape.com" and not host.endswith(".streamtape.com"):
        return None
    if parts.path != "/get_video":
        return None

    # Require real `id` / `token` parameters (not e.g. `vid=` or `xtoken=`).
    params = parse_qs(parts.query, keep_blank_values=True)
    if "id" not in params or "token" not in params:
        return None
    return url


def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Streamtape embed page to its `/get_video` stream URL.

    Args:
        page_url: Embed page URL to fetch.
        timeout: Timeout value passed through to `browser_get`.

    Returns:
        A single-element list holding an mp4 `StreamSource` whose link is the
        `/get_video` URL, or None when the page could not be fetched (non-200
        status or empty body) or no assignment produced a valid URL.

    Raises:
        HosterDead: On HTTP 404/410, or when the body contains the
            "Video not found" marker.
    """
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
    }
    r = browser_get(page_url, headers=headers, timeout=timeout)
    if r.status_code in (404, 410):
        raise HosterDead(f"streamtape {page_url}: HTTP {r.status_code}")
    if r.status_code != 200 or not r.text:
        log.info("streamtape: fetch fail %s status=%s", page_url, r.status_code)
        return None

    if _NOT_FOUND_RE.search(r.text):
        raise HosterDead(f"streamtape {page_url}: Video not found")

    # Try every matched assignment — the first valid URL wins.
    # `get_video` may sit entirely in the prefix (residential variant) or be
    # split across prefix + suffix (VPS variant, where the decoy assignments
    # produce `.comb/get_video`), so validation runs on the joined value.
    final_url: str | None = None
    for m in _ASSIGN_RE.finditer(r.text):
        tail = _apply_slices(m.group("suffix"), m.group("slices"))
        final_url = _candidate_url(m.group("prefix").strip(), tail)
        if final_url:
            break

    if not final_url:
        log.info("streamtape: no valid innerHTML assignment found in %s", page_url)
        return None

    return [
        StreamSource(
            link=final_url,
            quality=None,
            type="mp4",
            referer=page_url,
            # /get_video answers with a 302 to the direct mp4 on
            # tapecontent.net. The proxy has to follow the redirect
            # (stream_proxy uses follow_redirects=True by default).
            raw={"redirect_via": "streamtape_get_video"},
        )
    ]