# goon/app/extractors/tubes/hqfap.py

"""hqfap.com — direct stream extractor.

Scene page (SSR, za Cloudflare → curl_cffi w fetch_tube_html) ma JSON-LD
VideoObject z `contentUrl` = direct mp4. Dwie generacje hostingu w katalogu:

  - nowsze sceny: `v4.cdnde.com/...?video=<b64>&time=<epoch>&ip=<addr>` — param
    `ip` NIE jest egzekwowany (cross-IP test 2026-06-10: lokalny ISP i VPS Hetzner
    oba 206), token time-bound → resolve on-demand daje świeży URL,
  - starsze sceny: `vd*.okcdn.ru/?expires=...&srcIp=...&sig=...` (ok.ru) — również
    portable cross-IP (206 z innego IP niż fetcher).

Mobile gra direct (mobile_direct auto-detect w playback.py), zero proxy/WebView.
"""
from __future__ import annotations

import json
import logging
import re

from app.extractors._fetch import fetch_tube_html
from app.extractors._models import StreamSource

log = logging.getLogger(__name__)

# Captures the body of every <script type="application/ld+json"> element.
# DOTALL lets the non-greedy body span multiple lines; IGNORECASE tolerates
# upper-case tag/attribute spellings.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)
# Fallback for when the JSON-LD does not parse as JSON (trailing comma, etc.).
# The captured group is the raw text between the quotes, i.e. it has not been
# JSON-decoded.
_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+)"')
# Captures the 3-4 digit height from a "_<digits>p.mp4" fragment of a URL.
_QUALITY_RE = re.compile(r"_(\d{3,4})p\.mp4", re.IGNORECASE)


def _decode_json_string(raw: str) -> str:
    """Decode JSON string escapes in a value captured straight from markup.

    The regex fallback grabs the text between the quotes verbatim, so escaped
    slashes and unicode escapes are still present and would corrupt the URL.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        # Not a valid JSON string body (e.g. a raw control character or a
        # dangling backslash); at least undo the escaped slashes.
        return raw.replace("\\/", "/")


def _jsonld_nodes(data: object) -> list[dict]:
    """Flatten a parsed JSON-LD document into its dict nodes, in document order.

    Descends into top-level arrays and into ``@graph`` containers; any other
    value contributes nothing.
    """
    if isinstance(data, list):
        return [node for item in data for node in _jsonld_nodes(item)]
    if isinstance(data, dict):
        return [data, *_jsonld_nodes(data.get("@graph"))]
    return []


def _is_video_object(node: dict) -> bool:
    """Return True when the node's ``@type`` is, or includes, ``VideoObject``."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "VideoObject" in node_type
    return node_type == "VideoObject"


def _content_url_of(node: dict) -> str | None:
    """Return the node's stripped ``contentUrl``, or None if absent or unusable."""
    value = node.get("contentUrl")
    if isinstance(value, list):
        # Tolerate an array value by taking its first string entry.
        value = next((item for item in value if isinstance(item, str)), None)
    if not isinstance(value, str):
        return None
    return value.strip() or None


def _content_url_from_jsonld(html: str) -> str | None:
    """Return the first VideoObject ``contentUrl`` found in any JSON-LD script."""
    for script in _JSONLD_RE.finditer(html):
        raw = script.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for node in _jsonld_nodes(data):
            if not _is_video_object(node):
                continue
            # Keep scanning when a VideoObject has no usable contentUrl; a
            # later node may still carry one.
            url = _content_url_of(node)
            if url:
                return url
    return None


def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract the direct media URL from a scene page.

    The page HTML is fetched with ``fetch_tube_html`` and searched for a
    JSON-LD ``VideoObject`` carrying ``contentUrl``. If no JSON-LD block
    yields one, a regex over the raw HTML is used as a fallback and the
    captured value is JSON-unescaped.

    Args:
        page_url: URL of the scene page to resolve.
        timeout: Timeout forwarded to ``fetch_tube_html``.

    Returns:
        A single-element list with the resolved ``StreamSource``, or ``None``
        when no ``contentUrl`` starting with ``http`` could be found.
    """
    html = fetch_tube_html(page_url, timeout=timeout)

    content_url = _content_url_from_jsonld(html)
    if not content_url:
        fallback = _CONTENT_URL_RE.search(html)
        if fallback:
            content_url = _decode_json_string(fallback.group(1)).strip() or None
    if not content_url or not content_url.startswith("http"):
        log.warning("hqfap: no contentUrl in JSON-LD for %s", page_url)
        return None

    # Quality is only known when the URL embeds a "_<height>p.mp4" fragment.
    qm = _QUALITY_RE.search(content_url)
    quality = f"{qm.group(1)}p" if qm else None
    return [
        StreamSource(
            link=content_url,
            quality=quality,
            type="mp4",
            referer="https://hqfap.com/",
        )
    ]