goon/app/connectors/direct_scrapers/hqfap.py
jtrzupek 6de986b9a7 feat(hqfap): browse scraper + native mp4 extractor (~120k scenes)
PlayTube CMS. Sitemap-based pagination (listing has no GET paging),
JSON-LD VideoObject metadata, pornstar/category pills, " Clips"
categories mapped to studio. Direct mp4 (cdnde.com/okcdn.ru), tokens
time-bound and portable cross-IP, so mobile plays direct.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 17:51:04 +02:00

294 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""hqfap.com — latest-vids browse scraper (PlayTube CMS).
Added 2026-06-10 (user request). Re-uploader of the pornhd.pet catalog (~120k scenes;
thumbnails are the base64-encoded original URLs under `/uploads/images/`).
Per-scene signals (everything is in the SSR HTML of the detail page):
- JSON-LD VideoObject: name, uploadDate (ISO), duration (ISO 8601 `PT26M48S`),
  thumbnailUrl, contentUrl (direct mp4 — see the `hqfapcom` extractor)
- Performers: "Pornstars:" block — `<a class='pill' href='/videos/pornstar/<Name>'>`
- Categories: "Categories & Tags:" block — `<a class='pill' href='/videos/category/<Name>'>`
Some names are censored with asterisks (`Te***`) — those are skipped. Categories with the
" Clips" suffix are studios ("Filthy Kings Clips") → RawStudio.
Listing: the home page and `/videos/latest` do NOT paginate via GET (PlayTube
loads more items via AJAX), but the site has a full **sitemap index** (`/sitemap.xml` →
12× `sitemaps/videos/sitemap-N.xml`, ~10k URLs each, with `<lastmod>`). crawl_page
builds the catalog from the sitemap (sorted by lastmod desc = newest first) and cuts it
into pages of 20 URLs — this works for both browse_latest (pages 1-5) and deep_crawl (cursor).
Cloudflare: HTML pages require browser TLS (curl_cffi in browser_get); plain
curl from the VPS gets a 403. The sitemap and thumbnails are served without a challenge.
"""
from __future__ import annotations

import json
import logging
import re
from datetime import date, datetime
from html import unescape as _html_unescape
from xml.sax.saxutils import unescape as _xml_unescape

from app.connectors.base import (
    RawFingerprint,
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Site root; the sitemap index URL and the thumbnail referer are derived from it.
_BASE = "https://hqfap.com"
_SITEMAP_INDEX = f"{_BASE}/sitemap.xml"
# Number of catalog URLs that crawl_page serves per logical page.
_PAGE_SIZE = 20
# Text content of a <loc> element, without surrounding whitespace.
_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
# Inner content of one <url>...</url> sitemap entry (may span several lines).
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
# Text content of a <lastmod> element, without surrounding whitespace.
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")
# Numeric scene id: the digits between an underscore and ".html" in a scene URL.
_SCENE_ID_RE = re.compile(r"_(\d+)\.html")
# Body of a <script type="application/ld+json"> tag (either quote style).
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)
# Time part of an ISO 8601 duration: optional hours, minutes and whole seconds;
# the leading "P" and "T" designators are optional.
_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)
# Performer/category pills. PlayTube renders single-quoted attributes; both quote
# styles are accepted. The name is taken from `<span itemprop='name'>` because the
# href may be URL-encoded or contain spaces.
_PILL_RE = re.compile(
    r"<a\s+class=['\"]pill['\"]\s+href=['\"]/videos/(?P<kind>pornstar|category)/[^'\"]*['\"]"
    r".*?<span itemprop=['\"]name['\"]>(?P<name>[^<]+)</span>",
    re.IGNORECASE | re.DOTALL,
)
def _parse_iso_duration(value: str | None) -> int | None:
"""`PT26M48S` → sekundy. None gdy format nieznany."""
if not value:
return None
m = _ISO_DUR_RE.match(value.strip())
if not m:
return None
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
return total or None
def _parse_iso_date(value: str | None) -> date | None:
"""`2026-06-09T16:00:00+00:00` → date. None gdy parse fail."""
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
except ValueError:
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
if m:
try:
return date.fromisoformat(m.group(1))
except ValueError:
return None
return None
def _is_video_object(node: object) -> bool:
    """Return True when *node* is a JSON-LD dict typed as ``VideoObject``."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    # JSON-LD allows `@type` to be a single string or a list of strings.
    if isinstance(node_type, list):
        return "VideoObject" in node_type
    return node_type == "VideoObject"


def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD ``VideoObject`` found in *html*.

    Every ``application/ld+json`` script is tried in document order. A script
    may hold a single object, a list of objects, or an object wrapping its
    nodes in ``@graph``; scripts that fail to decode are skipped.

    Args:
        html: Full HTML of a scene detail page.

    Returns:
        The VideoObject dict, or ``None`` when the page has none.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            # strict=False tolerates raw control characters (e.g. newlines)
            # inside string values, which hand-templated JSON-LD may contain.
            data = json.loads(raw, strict=False)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        top_level = data if isinstance(data, list) else [data]
        for obj in top_level:
            if _is_video_object(obj):
                return obj
            graph = obj.get("@graph") if isinstance(obj, dict) else None
            if isinstance(graph, list):
                for node in graph:
                    if _is_video_object(node):
                        return node
    return None
class HQFapScraper(BaseBrowseScraper):
    """Browse scraper for hqfap.com that paginates over the site's sitemap.

    The listing cannot be paged via GET, so ``crawl_page`` is overridden (like
    ``EpornerApiScraper``): a newest-first catalog of scene URLs is built from
    the sitemap index and served in slices of ``_PAGE_SIZE`` URLs.
    """

    sitetag = "hqfapcom"

    def __init__(self) -> None:
        super().__init__()
        # Scene-URL catalog from the sitemap, newest first. Lazily initialised
        # once per instance (browse_latest and deep_crawl create one instance
        # per run, so the 13 XML fetches amortise over the whole run).
        self._catalog: list[str] | None = None

    # `_listing_url` / `_extract_scene_urls` are not used because pagination
    # comes from the sitemap, but they are abstract in the base class, so
    # no-op implementations are provided.
    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
        """Return the sitemap index URL (placeholder; *page* is ignored)."""
        return _SITEMAP_INDEX

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        """Return an empty list (placeholder; listings are never parsed)."""
        return []

    def _load_catalog(self) -> list[str] | None:
        """Build (once) the list of scene URLs sorted by ``lastmod`` descending.

        Returns:
            The cached or freshly built catalog, or ``None`` when the sitemap
            index could not be fetched, lists no video sitemaps, or no scene
            URL could be collected. Failures are not cached, so a later call
            retries.
        """
        if self._catalog is not None:
            return self._catalog
        try:
            idx = browser_get(_SITEMAP_INDEX, timeout=self._timeout)
            idx.raise_for_status()
        except Exception as e:
            log.warning("hqfap: sitemap index fetch failed: %s", e)
            return None
        # Sitemap URLs are XML-escaped (`&amp;` etc.); decode before fetching.
        sitemap_urls = [
            _xml_unescape(u)
            for u in _SITEMAP_LOC_RE.findall(idx.text)
            if "/videos/sitemap-" in u
        ]
        if not sitemap_urls:
            log.warning("hqfap: sitemap index has no video sitemaps")
            return None
        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
        for sm_url in sitemap_urls:
            try:
                sm = browser_get(sm_url, timeout=self._timeout)
                sm.raise_for_status()
            except Exception as e:
                # One missing sitemap does not fail the whole catalog — the
                # remaining files are enough.
                log.warning("hqfap: sitemap fetch failed %s: %s", sm_url, e)
                continue
            for block in _URL_BLOCK_RE.findall(sm.text):
                loc_m = _SITEMAP_LOC_RE.search(block)
                if not loc_m:
                    continue
                scene_url = _xml_unescape(loc_m.group(1))
                if "/watch/" not in scene_url:
                    continue
                lastmod_m = _LASTMOD_RE.search(block)
                entries.append((lastmod_m.group(1) if lastmod_m else "", scene_url))
        if not entries:
            return None
        # Newest first; entries without <lastmod> (empty string) sort last.
        entries.sort(key=lambda e: e[0], reverse=True)
        # De-duplicate by scene id (the sitemap can repeat a URL across files);
        # because the list is already sorted, the newest occurrence wins.
        seen_ids: set[str] = set()
        catalog: list[str] = []
        for _, url in entries:
            id_m = _SCENE_ID_RE.search(url)
            key = id_m.group(1) if id_m else url
            if key in seen_ids:
                continue
            seen_ids.add(key)
            catalog.append(url)
        log.info("hqfap: catalog loaded — %d scenes from %d sitemaps",
                 len(catalog), len(sitemap_urls))
        self._catalog = catalog
        return catalog

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch and parse the scenes of one catalog page.

        Args:
            page: 1-based page number; each page covers ``_PAGE_SIZE`` URLs.

        Returns:
            ``None`` when the catalog could not be loaded, an empty list when
            *page* is out of range, otherwise the scenes that were fetched and
            parsed successfully (individual failures are logged and skipped).
        """
        catalog = self._load_catalog()
        if catalog is None:
            return None
        if page < 1:
            # A negative slice start would silently serve scenes from the END
            # of the catalog; treat non-positive pages as out of range.
            return []
        start = (page - 1) * _PAGE_SIZE
        chunk = catalog[start:start + _PAGE_SIZE]
        if not chunk:
            return []
        out: list[RawScene] = []
        for scene_url in chunk:
            try:
                res = browser_get(scene_url, timeout=self._timeout)
                res.raise_for_status()
            except Exception as e:
                log.info("hqfap detail fetch failed %s: %s", scene_url, e)
                continue
            try:
                raw = self._parse_detail(scene_url, res.text)
            except Exception as e:
                log.warning("hqfap detail parse failed %s: %s", scene_url, e)
                continue
            if raw is not None:
                out.append(raw)
        return out

    @staticmethod
    def _thumbnail_from_jsonld(value: object) -> str | None:
        """Return the first non-empty URL string of a ``thumbnailUrl`` value.

        JSON-LD permits either a single string or a list of strings here.
        """
        candidates = value if isinstance(value, list) else [value]
        for item in candidates:
            if isinstance(item, str) and item.strip():
                return item.strip()
        return None

    def _classify_pills(
        self, detail_html: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]]:
        """Split the page's pills into studio, performers and tags.

        pornstar pills become performers. category pills become tags, unless
        the name ends with " Clips" (studio categories, e.g.
        "Filthy Kings Clips" → "Filthy Kings"); only the first such category
        is used as the studio. Censored names (``Te***``) are skipped — the
        asterisks are not data.
        """
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf: set[str] = set()
        seen_tag: set[str] = set()
        for m in _PILL_RE.finditer(detail_html):
            # The name is raw HTML text: decode entities such as `&amp;`
            # so they do not leak into names and slugs.
            name = _html_unescape(m.group("name")).strip()
            if not name or "*" in name:
                continue
            slug = slugify(name)
            if not slug:
                continue
            if m.group("kind").lower() == "pornstar":
                if slug not in seen_perf:
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
                    )
            elif name.lower().endswith(" clips"):
                if studio is None:
                    studio_name = name[: -len(" clips")].strip()
                    studio_slug = slugify(studio_name) if studio_name else ""
                    if studio_slug:
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_name,
                            slug=studio_slug,
                        )
            elif slug not in seen_tag:
                seen_tag.add(slug)
                tags.append(
                    RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
                )
        return studio, performers, tags

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from the HTML of one scene detail page.

        Args:
            scene_url: URL of the detail page (also the source of the scene id).
            detail_html: Server-rendered HTML of that page.

        Returns:
            The parsed scene, or ``None`` when the page has no JSON-LD
            VideoObject or the VideoObject has no usable title.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("hqfap: no JSON-LD VideoObject on %s", scene_url)
            return None
        raw_title = video.get("name")
        title = raw_title.strip() if isinstance(raw_title, str) else ""
        if not title:
            return None
        id_m = _SCENE_ID_RE.search(scene_url)
        scene_id = id_m.group(1) if id_m else None
        duration_sec = _parse_iso_duration(video.get("duration"))
        release_date = _parse_iso_date(video.get("uploadDate"))
        thumbnail_url = self._thumbnail_from_jsonld(video.get("thumbnailUrl"))

        studio, performers, tags = self._classify_pills(detail_html)

        # Phash: thumbnails are re-encoded webp posters from pornhd.pet — for
        # studio content they are sometimes the original studio art (a chance
        # of a phash match), for amateur content they will not match.
        # Graceful: a miss falls back to composite scoring
        # (title + performer + duration).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Stream: the JSON-LD contentUrl is a direct mp4, but its token
        # (`time=`) expires — stream_url is deliberately NOT stored; the
        # `hqfapcom` extractor resolves a fresh one on demand.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_id or scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )