goon/app/extractors/hosters/streamtape.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

117 lines
4.6 KiB
Python

"""Streamtape embed → direct mp4 extractor.
Pattern (verified 2026-05-15 from a residential IP, live URL `/e/PZqBZp4OomF0Q61`):
1. The embed page `/e/<id>` returns an ~89 KB body containing 4
`document.getElementById(...).innerHTML` assignments that build the full
`/get_video` URL. Each of them uses the same pattern:
document.getElementById('robotlink').innerHTML =
'//streamtape.com/get_video' +
('<junk>?id=<id>&expires=...&ip=...&token=...').substring(N).substring(M);
The junk is 3-4 characters in front of `?` — substring(N).substring(M) cuts it off.
2. Once assembled, fetching `https://streamtape.com/get_video?id=...&token=...`
answers with a 302 to
`https://<cluster>.tapecontent.net/radosgw/<id>/<signed_path>/<title>.mp4`
(direct mp4, video/mp4, ~500 MB, not IP-bound).
3. The body sometimes contains `Video not found! Maybe it got deleted by the creator!`
— this is the case for most URLs in our DB (12k mass-DMCA'd 2026-05-15). In that
case HosterDead is raised and the caller in playback.py marks dead_at.
Live URL coverage probed 2026-05-15: ~5% of the URLs are alive, the rest return
`Video not found`.
"""
from __future__ import annotations
import logging
import re
from app.extractors._fetch import _DEFAULT_UA, browser_get
from app.extractors._models import HosterDead, StreamSource
log = logging.getLogger(__name__)
# Match: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);`
# Streamtape generates 4 assignments (ideoolink x2 + botlink + robotlink) — 2 of them are
# DECOYS with a broken hostname (`.comb`, `.cob`) and only botlink/robotlink yield the real URL.
# The prefix may be a fragment: `/streamtape.com`, `//streamtape.co`, `//streamtape.com/g`
# — `get_video` is often split between prefix and suffix once the slices are applied.
# The accept/reject decision is therefore made on the COMBINED output containing the
# exact text `streamtape.com/get_video?`, not on anything this pattern captures alone.
_ASSIGN_RE = re.compile(
# elem: id of the assigned element — a run of letters ending in "link".
r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*"
# prefix: the first quoted literal; it must contain "streamtape".
r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]"
# Optional empty-string literal concatenated between prefix and suffix (`+ '' +`).
r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?"
# suffix: the non-empty quoted literal inside the parentheses.
r"\(['\"](?P<suffix>[^'\"]+)['\"]\)"
# slices: one or more chained `.substring(<digits>)` calls applied to the suffix.
r"(?P<slices>(?:\.substring\(\d+\))+)",
re.IGNORECASE,
)
# Captures the numeric argument of one `.substring(N)` call.
_SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)")
# Text searched for in the embed body to detect a removed video.
_NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE)


def _apply_slices(suffix: str, slices_str: str) -> str:
    """Replay a chain of JavaScript ``.substring(N)`` calls on *suffix*.

    Args:
        suffix: The string literal the calls were chained onto.
        slices_str: The raw call chain, e.g. ``".substring(2).substring(1)"``.

    Returns:
        *suffix* with the leading characters removed; unchanged when
        *slices_str* contains no ``.substring(N)`` call, and empty when the
        offsets run past the end of the string.
    """
    # Every offset is a non-negative integer, so dropping N and then M leading
    # characters is the same as dropping N + M in a single slice.
    total_offset = sum(int(arg) for arg in _SUBSTRING_RE.findall(slices_str))
    return suffix[total_offset:]
def _with_https_scheme(candidate: str) -> str:
    """Return *candidate* as an absolute URL that always carries a scheme.

    The embed page emits protocol-relative (``//host/path``) or single-slash
    (``/host/path``) strings. Any leading slashes are dropped and ``https://``
    is prepended; a fragment without a leading slash is treated as a bare
    ``host/path``. Strings that already start with ``http://`` or
    ``https://`` are returned unchanged.
    """
    if candidate.startswith(("http://", "https://")):
        return candidate
    return "https://" + candidate.lstrip("/")


def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Streamtape embed page to its ``/get_video`` link.

    Args:
        page_url: URL of the embed page; also used as the source's referer.
        timeout: Forwarded unchanged to ``browser_get``.

    Returns:
        A one-element list holding an ``mp4`` ``StreamSource`` whose link is
        the assembled ``streamtape.com/get_video?...`` URL, or ``None`` when
        the page could not be fetched (non-200 status or empty body) or no
        assignment in the page produced a valid URL.

    Raises:
        HosterDead: On HTTP 404/410, or when the body contains the
            "Video not found" marker.
    """
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
    }
    r = browser_get(page_url, headers=headers, timeout=timeout)
    if r.status_code in (404, 410):
        raise HosterDead(f"streamtape {page_url}: HTTP {r.status_code}")
    if r.status_code != 200 or not r.text:
        log.info("streamtape: fetch fail %s status=%s", page_url, r.status_code)
        return None
    if _NOT_FOUND_RE.search(r.text):
        raise HosterDead(f"streamtape {page_url}: Video not found")

    # Try every assignment in page order — the first one that assembles into
    # a valid URL wins. `get_video` may sit entirely in the prefix or be
    # split between prefix and sliced suffix, so only the combined string is
    # checked.
    final_url: str | None = None
    for m in _ASSIGN_RE.finditer(r.text):
        tail = _apply_slices(m.group("suffix"), m.group("slices"))
        url = _with_https_scheme(m.group("prefix").strip() + tail)
        # Validation — filters out the decoy assignments, whose hostname is
        # broken (`streamtape.comb`, `streamtape.cob`).
        if (
            "streamtape.com/get_video?" in url
            and "id=" in url
            and "token=" in url
        ):
            final_url = url
            break

    if not final_url:
        log.info("streamtape: no valid innerHTML assignment found in %s", page_url)
        return None

    return [
        StreamSource(
            link=final_url,
            quality=None,
            type="mp4",
            referer=page_url,
            # Per the module notes, /get_video answers with a 302 to the
            # direct mp4 on tapecontent.net, so whoever consumes this link
            # has to follow redirects.
            raw={"redirect_via": "streamtape_get_video"},
        )
    ]