goon/app/extractors/tubes/hqfap.py
jtrzupek 6de986b9a7 feat(hqfap): browse scraper + native mp4 extractor (~120k scenes)
PlayTube CMS. Sitemap-based pagination (listing has no GET paging),
JSON-LD VideoObject metadata, pornstar/category pills, " Clips"
categories mapped to studio. Direct mp4 (cdnde.com/okcdn.ru), tokens
time-bound and portable cross-IP, so mobile plays direct.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 17:51:04 +02:00

69 lines
2.4 KiB
Python

"""hqfap.com — direct stream extractor.
Scene page (SSR, za Cloudflare → curl_cffi w fetch_tube_html) ma JSON-LD
VideoObject z `contentUrl` = direct mp4. Dwie generacje hostingu w katalogu:
- nowsze sceny: `v4.cdnde.com/...?video=<b64>&time=<epoch>&ip=<addr>` — param
`ip` NIE jest egzekwowany (cross-IP test 2026-06-10: lokalny ISP i VPS Hetzner
oba 206), token time-bound → resolve on-demand daje świeży URL,
- starsze sceny: `vd*.okcdn.ru/?expires=...&srcIp=...&sig=...` (ok.ru) — również
portable cross-IP (206 z innego IP niż fetcher).
Mobile gra direct (mobile_direct auto-detect w playback.py), zero proxy/WebView.
"""
from __future__ import annotations
import json
import logging
import re
from app.extractors._fetch import fetch_tube_html
from app.extractors._models import StreamSource
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Captures the body of every <script type="application/ld+json"> element;
# DOTALL lets the payload span several lines.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    flags=re.I | re.S,
)

# Fallback for when the JSON-LD does not parse as JSON (trailing comma etc.).
_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+)"')

# Picks the 3-4 digit resolution out of a file name ending in "_<N>p.mp4".
_QUALITY_RE = re.compile(r"_(\d{3,4})p\.mp4", flags=re.I)
def _iter_jsonld_nodes(payload):
    """Yield every dict in a decoded JSON-LD payload, in document order.

    Descends into lists and ``@graph`` containers so that a VideoObject
    wrapped by either is still visited.
    """
    pending = [payload]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            # Reversed so that pop() hands the items back in original order.
            pending.extend(reversed(node))
        elif isinstance(node, dict):
            yield node
            graph = node.get("@graph")
            if isinstance(graph, (list, dict)):
                pending.append(graph)


def _is_video_object(node: dict) -> bool:
    """Return True when the node's ``@type`` is, or lists, ``VideoObject``."""
    kind = node.get("@type")
    if isinstance(kind, list):
        return "VideoObject" in kind
    return kind == "VideoObject"


def _video_content_url(html: str) -> str | None:
    """Return the first non-empty VideoObject ``contentUrl`` from the JSON-LD scripts."""
    for script in _JSONLD_RE.finditer(html):
        raw = script.group(1).strip()
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for node in _iter_jsonld_nodes(payload):
            if not _is_video_object(node):
                continue
            url = node.get("contentUrl")
            # Non-string values (null, list, ...) cannot be used as a link.
            if isinstance(url, str) and url.strip():
                return url.strip()
    return None


def _unescape_json_string(raw: str) -> str:
    """Decode JSON string escapes (``\\/``, ``\\uXXXX``) in a regex-captured literal."""
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        # Not a valid JSON string body; undo at least the escaped slashes.
        return raw.replace("\\/", "/")


def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Resolve the direct mp4 stream for a scene page.

    The page is fetched with ``fetch_tube_html`` and its JSON-LD scripts are
    searched for a ``VideoObject`` with a ``contentUrl``. When no script
    yields one (for instance because the JSON does not parse), the first
    ``"contentUrl": "..."`` pair found anywhere in the HTML is used instead.

    Args:
        page_url: URL of the scene page.
        timeout: Passed through to ``fetch_tube_html``.

    Returns:
        A single-element list with the mp4 ``StreamSource``, or ``None``
        when no ``contentUrl`` starting with ``http`` was found.
    """
    # NOTE(review): the fetch helper's return contract is not visible here;
    # a falsy result is treated as an empty page instead of crashing the regex.
    html = fetch_tube_html(page_url, timeout=timeout) or ""

    content_url = _video_content_url(html)
    if not content_url:
        fallback = _CONTENT_URL_RE.search(html)
        if fallback:
            # The capture is a raw JSON string literal, so escapes such as
            # "https:\/\/" must be decoded before the URL is usable.
            content_url = _unescape_json_string(fallback.group(1)).strip() or None

    if not content_url or not content_url.startswith("http"):
        log.warning("hqfap: no contentUrl in JSON-LD for %s", page_url)
        return None

    # Quality is only known when the file name carries a "_<N>p.mp4" suffix.
    quality_match = _QUALITY_RE.search(content_url)
    quality = f"{quality_match.group(1)}p" if quality_match else None
    return [
        StreamSource(
            link=content_url,
            quality=quality,
            type="mp4",
            referer="https://hqfap.com/",
        )
    ]