"""hqfap.com — latest-vids browse scraper (PlayTube CMS).
Added 2026-06-10 (user request). The site re-uploads the pornhd.pet catalogue
(~120k scenes; thumbnails are base64-encoded original URLs under
`/uploads/images/`).
Per-scene signals (all present in the server-rendered HTML of the detail page):
- JSON-LD VideoObject: name, uploadDate (ISO), duration (ISO 8601, e.g.
  `PT26M48S`), thumbnailUrl, contentUrl (direct mp4 — see the `hqfapcom`
  extractor).
- Performers: the "Pornstars:" block — pill links whose href contains
  `/pornstar/`.
- Categories: the "Categories & Tags:" block — pill links whose href contains
  `/category/`.
Some names are censored with asterisks (`Te***`) — those are skipped. Categories
with the " Clips" suffix are studios ("Filthy Kings Clips") → RawStudio.
Listing: the home page and `/videos/latest` do NOT paginate via GET (PlayTube
loads more items over AJAX), but the site exposes a full **sitemap index**
(`/sitemap.xml` → 12× `sitemaps/videos/sitemap-N.xml`, ~10k URLs each, with
`<lastmod>`). crawl_page builds the catalogue from the sitemaps (sorted by
lastmod descending = newest first) and cuts it into pages of 20 URLs — this
works both for browse_latest (pages 1-5) and for deep_crawl (cursor).
Cloudflare: HTML pages require a browser TLS fingerprint (curl_cffi inside
browser_get); plain curl from the VPS gets a 403. Sitemaps and thumbnails are
served without a challenge.
"""
from __future__ import annotations
import json
import logging
import re
from datetime import date, datetime
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify
log = logging.getLogger(__name__)

# Site root and the sitemap index the scene catalogue is built from.
_BASE = "https://hqfap.com"
_SITEMAP_INDEX = f"{_BASE}/sitemap.xml"
# Number of scene URLs served per logical crawl page.
_PAGE_SIZE = 20

# Sitemap XML is picked apart with regexes. The element names had been lost from
# these patterns (they had degenerated to e.g. r"\s*([^<]+?)\s*", which matches
# arbitrary text), so the tags are spelled out explicitly again.
_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")

# Numeric scene id embedded in a scene URL (`..._<id>.html`).
_SCENE_ID_RE = re.compile(r"_(\d+)\.html")

# JSON-LD <script> element; group 1 is the raw JSON payload. The previous pattern
# was an empty string, which has no group 1 to read.
_JSONLD_RE = re.compile(
    r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)

# Lenient ISO 8601 duration: hours / minutes / seconds, `P` and `T` optional.
_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)

# Performer / category pills. PlayTube renders single-quoted attributes; both
# quote styles are accepted. `kind` comes from the href path segment; `name` is
# the text of the element inside the link (the href may be URL-encoded or contain
# spaces, so it is not used for the name).
# NOTE(review): the previous pattern did not compile (unbalanced parenthesis and
# unnamed `(?P` groups) and the element that wraps the name was lost; `<span>` is
# assumed here — confirm against the live detail-page markup.
_PILL_RE = re.compile(
    r"<a\b[^>]*?href=['\"][^'\"]*/(?P<kind>pornstar|category)/[^'\"]*['\"]"
    r".*?<span[^>]*>\s*(?P<name>[^<]+)",
    re.IGNORECASE | re.DOTALL,
)
def _parse_iso_duration(value: str | None) -> int | None:
"""`PT26M48S` → sekundy. None gdy format nieznany."""
if not value:
return None
m = _ISO_DUR_RE.match(value.strip())
if not m:
return None
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
return total or None
def _parse_iso_date(value: str | None) -> date | None:
"""`2026-06-09T16:00:00+00:00` → date. None gdy parse fail."""
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
except ValueError:
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
if m:
try:
return date.fromisoformat(m.group(1))
except ValueError:
return None
return None
def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD object of type ``VideoObject`` found in *html*.

    Every match of ``_JSONLD_RE`` is tried in document order. Empty or
    unparseable payloads are skipped; a payload may be a single object or a list
    of objects. Returns None when no ``VideoObject`` is found.
    """
    for script in _JSONLD_RE.finditer(html):
        payload = script.group(1).strip()
        if payload == "":
            continue
        try:
            parsed = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        candidates = parsed if isinstance(parsed, list) else [parsed]
        found = next(
            (
                entry
                for entry in candidates
                if isinstance(entry, dict) and entry.get("@type") == "VideoObject"
            ),
            None,
        )
        if found is not None:
            return found
    return None
class HQFapScraper(BaseBrowseScraper):
    """Browse scraper for hqfap.com that paginates over the site's video sitemaps.

    The listing pages cannot be paged with plain GET requests, so ``crawl_page``
    is overridden: it builds a newest-first catalogue of scene URLs from the
    sitemaps and serves it in slices of ``_PAGE_SIZE`` URLs, fetching and parsing
    the detail page of every scene in the slice.
    """

    sitetag = "hqfapcom"

    def __init__(self) -> None:
        super().__init__()
        # Scene-URL catalogue from the sitemaps, newest first. Loaded lazily, once
        # per instance (browse_latest and deep_crawl create an instance per run,
        # so the 13 XML fetches are amortised over the whole run).
        self._catalog: list[str] | None = None

    # crawl_page is overridden (like EpornerApiScraper) — the listing cannot be
    # paged via GET, the sitemap is the pagination source. _listing_url and
    # _extract_scene_urls are unused but abstract, so no-op implementations are
    # provided.
    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
        """Return the sitemap index URL; unused because crawl_page is overridden."""
        return _SITEMAP_INDEX

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        """Return no URLs; unused because crawl_page is overridden."""
        return []

    def _load_catalog(self) -> list[str] | None:
        """Return all scene URLs from the video sitemaps, sorted by lastmod descending.

        A successfully built catalogue is cached on the instance; a failed load
        is not cached, so the next call tries again.

        Returns:
            The catalogue, or None when the sitemap index cannot be fetched,
            lists no video sitemaps, or no scene URL could be collected.
        """
        if self._catalog is not None:
            return self._catalog
        try:
            idx = browser_get(_SITEMAP_INDEX, timeout=self._timeout)
            idx.raise_for_status()
        except Exception as e:
            log.warning("hqfap: sitemap index fetch failed: %s", e)
            return None
        sitemap_urls = [
            u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
        ]
        if not sitemap_urls:
            log.warning("hqfap: sitemap index has no video sitemaps")
            return None

        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
        for sm_url in sitemap_urls:
            try:
                sm = browser_get(sm_url, timeout=self._timeout)
                sm.raise_for_status()
            except Exception as e:
                # One missing sitemap does not fail the whole load — the rest of
                # the catalogue is good enough.
                log.warning("hqfap: sitemap fetch failed %s: %s", sm_url, e)
                continue
            for block in _URL_BLOCK_RE.findall(sm.text):
                loc_m = _SITEMAP_LOC_RE.search(block)
                if not loc_m or "/watch/" not in loc_m.group(1):
                    continue
                lastmod_m = _LASTMOD_RE.search(block)
                entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))
        if not entries:
            return None

        # Newest first; entries without a lastmod (empty string) sort last.
        entries.sort(key=lambda e: e[0], reverse=True)
        # De-duplicate by scene id (a sitemap may repeat a URL across files),
        # keeping the first, i.e. newest, occurrence.
        seen_ids: set[str] = set()
        catalog: list[str] = []
        for _, url in entries:
            id_m = _SCENE_ID_RE.search(url)
            key = id_m.group(1) if id_m else url
            if key in seen_ids:
                continue
            seen_ids.add(key)
            catalog.append(url)
        log.info("hqfap: catalog loaded — %d scenes from %d sitemaps",
                 len(catalog), len(sitemap_urls))
        self._catalog = catalog
        return catalog

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch and parse the scenes of one logical page of the catalogue.

        Args:
            page: 1-based page number; each page covers ``_PAGE_SIZE`` URLs.

        Returns:
            None when the catalogue cannot be loaded; an empty list when *page*
            lies outside the catalogue; otherwise the scenes that were fetched
            and parsed successfully (failing scenes are logged and skipped).
        """
        catalog = self._load_catalog()
        if catalog is None:
            return None
        # A page below 1 would give a negative start index and silently slice
        # from the tail of the catalogue.
        if page < 1:
            return []
        start = (page - 1) * _PAGE_SIZE
        chunk = catalog[start:start + _PAGE_SIZE]
        if not chunk:
            return []
        out: list[RawScene] = []
        for scene_url in chunk:
            try:
                res = browser_get(scene_url, timeout=self._timeout)
                res.raise_for_status()
            except Exception as e:
                log.info("hqfap detail fetch failed %s: %s", scene_url, e)
                continue
            try:
                raw = self._parse_detail(scene_url, res.text)
            except Exception as e:
                log.warning("hqfap detail parse failed %s: %s", scene_url, e)
                continue
            if raw is not None:
                out.append(raw)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from the HTML of a scene detail page.

        Args:
            scene_url: URL of the detail page; also the source of the scene id.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when the page has no JSON-LD VideoObject
            or the VideoObject has no title.
        """
        from html import unescape  # local import keeps this fix self-contained

        video = _extract_video_object(detail_html)
        if not video:
            log.info("hqfap: no JSON-LD VideoObject on %s", scene_url)
            return None
        title = (video.get("name") or "").strip()
        if not title:
            return None
        id_m = _SCENE_ID_RE.search(scene_url)
        scene_id = id_m.group(1) if id_m else None
        duration_sec = _parse_iso_duration(video.get("duration"))
        release_date = _parse_iso_date(video.get("uploadDate"))

        # schema.org allows thumbnailUrl to be a single URL or a list of URLs;
        # take the first usable string and drop anything else.
        thumbnail_url = video.get("thumbnailUrl")
        if isinstance(thumbnail_url, list):
            thumbnail_url = next(
                (u for u in thumbnail_url if isinstance(u, str) and u), None
            )
        if not isinstance(thumbnail_url, str) or not thumbnail_url:
            thumbnail_url = None

        # Pills: pornstar → performer; category → tag, unless it carries the
        # " Clips" suffix (studio categories from the pornhd import, e.g.
        # "Filthy Kings Clips" → "Filthy Kings"). Censored names (`Te***`) are
        # skipped — asterisks are not data.
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf: set[str] = set()
        seen_tag: set[str] = set()
        for m in _PILL_RE.finditer(detail_html):
            # Text lifted from raw markup is still entity-encoded ("&amp;");
            # decode it so names and the slugs derived from them are clean.
            name = unescape(m.group("name")).strip()
            if not name or "*" in name:
                continue
            slug = slugify(name)
            if not slug:
                continue
            if m.group("kind").lower() == "pornstar":
                if slug not in seen_perf:
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
                    )
            elif name.lower().endswith(" clips"):
                # Only the first studio category counts; later ones are ignored.
                if studio is None:
                    studio_name = name[: -len(" clips")].strip()
                    studio_slug = slugify(studio_name) if studio_name else ""
                    # An empty slug would yield the id "<sitetag>:studio:".
                    if studio_slug:
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_name,
                            slug=studio_slug,
                        )
            elif slug not in seen_tag:
                seen_tag.add(slug)
                tags.append(
                    RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
                )

        # Phash: thumbnails are re-encoded webp copies of pornhd.pet posters — for
        # studio content they are sometimes the original studio art (a chance of a
        # phash match), for amateur content they will not match. Graceful: a miss
        # falls back to composite scoring (title + performer + duration).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Stream: the JSON-LD contentUrl is a direct mp4 but its token (`time=`)
        # expires — stream_url is NOT stored; the `hqfapcom` extractor resolves a
        # fresh one on demand.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_id or scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )