Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
117 lines
4.6 KiB
Python
117 lines
4.6 KiB
Python
"""Streamtape embed → direct mp4 extractor.
|
|
|
|
Pattern (verified 2026-05-15 from a residential connection, live URL `/e/PZqBZp4OomF0Q61`):
|
|
|
|
1. The embed page `/e/<id>` returns an 89KB body with 4 `document.getElementById(...).innerHTML`
   assignments that build the full URL to `/get_video`. Each one uses the same
   pattern:
|
|
|
|
document.getElementById('robotlink').innerHTML =
|
|
'//streamtape.com/get_video' +
|
|
('<junk>?id=<id>&expires=...&ip=...&token=...').substring(N).substring(M);
|
|
|
|
   The junk is the 3-4 characters before the `?` — substring(N).substring(M) strips them off.
|
|
|
|
2. Once assembled, fetching `https://streamtape.com/get_video?id=...&token=...` →
   302 → `https://<cluster>.tapecontent.net/radosgw/<id>/<signed_path>/<title>.mp4`
   (direct mp4, video/mp4 ~500MB, no IP binding).
|
|
|
|
3. The body sometimes contains `Video not found! Maybe it got deleted by the creator!`
   — this is the case for most URLs in our DB (12k mass-DMCA'd 2026-05-15). In that case
   HosterDead is raised and the caller in playback.py marks dead_at.
|
|
|
|
Live URL coverage probed 2026-05-15: ~5% of URLs are alive, the rest return `Video not found`.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
from app.extractors._fetch import _DEFAULT_UA, browser_get
|
|
from app.extractors._models import HosterDead, StreamSource
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
# Matches: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);`
# Streamtape emits 4 assignments (ideoolink x2 + botlink + robotlink) — 2 of them are
# DECOYS with a broken hostname (`.comb`, `.cob`); only botlink/robotlink yield the real URL.
# The prefix may be a fragment: `/streamtape.com`, `//streamtape.co`, `//streamtape.com/g`
# — `get_video` is often split between prefix and suffix once the slices are applied.
# The decision is therefore made on the COMBINED output containing exactly
# `streamtape.com/get_video?`.
_ASSIGN_RE = re.compile(
    "".join(
        (
            # `document.getElementById('<elem>').innerHTML =`
            r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*",
            # quoted prefix literal that mentions streamtape
            r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]",
            # `+`, optionally followed by an empty string literal and another `+`
            r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?",
            # parenthesised, quoted suffix literal
            r"\(['\"](?P<suffix>[^'\"]+)['\"]\)",
            # one or more chained `.substring(N)` calls
            r"(?P<slices>(?:\.substring\(\d+\))+)",
        )
    ),
    flags=re.IGNORECASE,
)
|
|
# A single `.substring(N)` call; group 1 captures the numeric offset N.
_SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)")
# Marker text of the "video not found" page (case-insensitive, flexible whitespace).
_NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE)


def _apply_slices(suffix: str, slices_str: str) -> str:
    """Emulate a chain of JavaScript ``.substring(N)`` calls on *suffix*.

    Args:
        suffix: The string literal the calls are applied to.
        slices_str: The raw call chain, e.g. ``".substring(3).substring(1)"``.

    Returns:
        *suffix* with the leading characters removed. Dropping ``N`` and then
        ``M`` leading characters equals dropping ``N + M`` at once, so the
        offsets are summed and applied as a single slice. An offset past the
        end yields an empty string.
    """
    total_offset = sum(int(offset) for offset in _SUBSTRING_RE.findall(slices_str))
    return suffix[total_offset:]
|
|
|
|
|
|
def _resolve_get_video_url(html: str) -> str | None:
    """Rebuild the ``/get_video`` URL from the embed page's innerHTML assignments.

    Every assignment matched by ``_ASSIGN_RE`` is tried in document order; the
    first one whose reconstructed URL passes validation wins.

    Args:
        html: Body of the embed page.

    Returns:
        The absolute ``get_video`` URL, or ``None`` when no assignment yields
        a valid one.
    """
    for match in _ASSIGN_RE.finditer(html):
        prefix = match.group("prefix").strip()
        tail = _apply_slices(match.group("suffix"), match.group("slices"))
        combined = prefix + tail

        # Normalise to an absolute https URL.
        if combined.startswith("//"):
            # Protocol-relative: `//streamtape.com/...`
            url = "https:" + combined
        elif combined.startswith("/"):
            # `/streamtape.com/...` -> `https://streamtape.com/...`
            url = "https:/" + combined
        elif "://" not in combined.partition("?")[0]:
            # Bare `streamtape.com/...` with no scheme: without this branch the
            # value would pass validation below and be returned as an
            # unfetchable scheme-less link.
            url = "https://" + combined
        else:
            url = combined

        # Validation — filters out the decoys (`streamtape.comb`, `streamtape.cob`).
        if (
            "streamtape.com/get_video?" in url
            and "id=" in url
            and "token=" in url
        ):
            return url
    return None


def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Streamtape embed page to its ``get_video`` stream source.

    Fetches *page_url* with ``browser_get``, reconstructs the ``/get_video``
    URL from the page's ``innerHTML`` assignments and wraps it in a
    ``StreamSource``.

    Args:
        page_url: URL of the Streamtape embed page; also used as the referer
            of the returned source.
        timeout: Timeout forwarded to ``browser_get``.

    Returns:
        A one-element list with the mp4 ``StreamSource``, or ``None`` when the
        fetch did not return a non-empty HTTP 200 body or no valid assignment
        was found in it.

    Raises:
        HosterDead: On HTTP 404/410, or when the body contains the
            "Video not found" marker.
    """
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
    }
    r = browser_get(page_url, headers=headers, timeout=timeout)
    if r.status_code in (404, 410):
        raise HosterDead(f"streamtape {page_url}: HTTP {r.status_code}")
    if r.status_code != 200 or not r.text:
        log.info("streamtape: fetch fail %s status=%s", page_url, r.status_code)
        return None

    html = r.text
    if _NOT_FOUND_RE.search(html):
        raise HosterDead(f"streamtape {page_url}: Video not found")

    # Try all assignments — the first valid URL wins. `get_video` may sit
    # entirely in the prefix (residential variant) or be split across prefix
    # and suffix (VPS variant, where the decoy assignments produce
    # `.comb/get_video`).
    final_url = _resolve_get_video_url(html)
    if not final_url:
        log.info("streamtape: no valid innerHTML assignment found in %s", page_url)
        return None

    return [
        StreamSource(
            link=final_url,
            quality=None,
            type="mp4",
            referer=page_url,
            # /get_video answers with a 302 to the direct mp4 on tapecontent.net.
            # The proxy must follow the redirect (stream_proxy uses
            # follow_redirects=True by default).
            raw={"redirect_via": "streamtape_get_video"},
        )
    ]
|