From 6de986b9a7b2482f660304a795deb68c1ed479eb Mon Sep 17 00:00:00 2001
From: jtrzupek
Date: Wed, 10 Jun 2026 17:51:04 +0200
Subject: [PATCH] feat(hqfap): browse scraper + native mp4 extractor (~120k scenes)

PlayTube CMS. Sitemap-based pagination (listing has no GET paging),
JSON-LD VideoObject metadata, pornstar/category pills, " Clips"
categories mapped to studio. Direct mp4 (cdnde.com/okcdn.ru), tokens
time-bound and portable cross-IP, so mobile plays direct.

Co-Authored-By: Claude Fable 5
---
 app/connectors/direct_scrapers/__init__.py |   7 +
 app/connectors/direct_scrapers/hqfap.py    | 294 +++++++++++++++++++++
 app/extractors/__init__.py                 |   5 +
 app/extractors/tubes/hqfap.py              |  69 +++++
 4 files changed, 375 insertions(+)
 create mode 100644 app/connectors/direct_scrapers/hqfap.py
 create mode 100644 app/extractors/tubes/hqfap.py

diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py
index 171de5c..cb77395 100644
--- a/app/connectors/direct_scrapers/__init__.py
+++ b/app/connectors/direct_scrapers/__init__.py
@@ -153,6 +153,7 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F
 from app.connectors.direct_scrapers.yesporn import YesPornVipScraper  # noqa: E402
 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper  # noqa: E402
 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper  # noqa: E402
+from app.connectors.direct_scrapers.hqfap import HQFapScraper  # noqa: E402
 from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper  # noqa: E402
 from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper  # noqa: E402
 
@@ -203,6 +204,12 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
     # Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
     # full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
     XVideosBrowseScraper,
+    # HQFapScraper — added 2026-06-10 (user request). PlayTube CMS, ~120k scenes
+    # (re-upload of pornhd.pet). JSON-LD VideoObject (title+uploadDate+duration+thumb+
+    # contentUrl) + Pornstars/Categories pills on the detail page. The listing is not
+    # pageable via GET → crawl_page walks the sitemap index (12 files, lastmod desc).
+    # Direct mp4 (cdnde.com / okcdn.ru), cross-IP portable → native extractor `hqfapcom`.
+    HQFapScraper,
     # 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML
     # (probe 2026-06-01). Wymagałby headless render — odłożony.
     # porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
diff --git a/app/connectors/direct_scrapers/hqfap.py b/app/connectors/direct_scrapers/hqfap.py
new file mode 100644
index 0000000..6aecb55
--- /dev/null
+++ b/app/connectors/direct_scrapers/hqfap.py
@@ -0,0 +1,294 @@
+"""hqfap.com — latest-vids browse scraper (PlayTube CMS).
+
+Added 2026-06-10 (user request). Re-uploader of the pornhd.pet catalogue (~120k scenes;
+thumbnails are base64-encoded original URLs under `/uploads/images/`).
+
+Per-scene signals (everything is in the SSR HTML of the detail page):
+  - JSON-LD VideoObject: name, uploadDate (ISO), duration (ISO 8601 `PT26M48S`),
+    thumbnailUrl, contentUrl (direct mp4 — see extractor `hqfapcom`)
+  - Performers: "Pornstars:" block — pill links whose href contains `/pornstar/`
+  - Categories: "Categories & Tags:" block — pill links whose href contains `/category/`
+    Some names are censored with asterisks (`Te***`) — skipped. Categories with the
+    " Clips" suffix are studios ("Filthy Kings Clips") → RawStudio.
+
+Listing: the home page and `/videos/latest` are NOT pageable via GET (PlayTube
+loads more via AJAX), but the site has a full **sitemap index** (`/sitemap.xml` →
+12× `sitemaps/videos/sitemap-N.xml`, ~10k URLs each with `<lastmod>`). crawl_page
+builds the catalogue from the sitemaps (sorted lastmod desc = newest first) and cuts it
+into pages of 20 URLs — works for both browse_latest (pages 1-5) and deep_crawl (cursor).
+
+Cloudflare: HTML pages require browser TLS (curl_cffi in browser_get); plain
+curl from the VPS gets 403. Sitemaps and thumbnails load without a challenge.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import re
+from datetime import date, datetime
+
+from app.connectors.base import (
+    RawFingerprint,
+    RawPerformer,
+    RawPlaybackSource,
+    RawScene,
+    RawStudio,
+    RawTag,
+)
+from app.connectors.direct_scrapers._browse_base import (
+    BaseBrowseScraper,
+    compute_thumbnail_phash,
+)
+from app.extractors import browser_get
+from app.normalize.text import slugify
+
+log = logging.getLogger(__name__)
+
+_BASE = "https://hqfap.com"
+_SITEMAP_INDEX = f"{_BASE}/sitemap.xml"
+_PAGE_SIZE = 20
+
+_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
+_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
+_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")
+_SCENE_ID_RE = re.compile(r"_(\d+)\.html")
+
+_JSONLD_RE = re.compile(
+    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
+    re.IGNORECASE | re.DOTALL,
+)
+_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)
+
+# Performer/category pills. PlayTube renders single-quoted attrs; both quote styles are
+# accepted. Name is read from `<span>` (href may be URL-encoded / contain spaces). NOTE(review): tag literals were restored after markup loss — verify against live HTML.
+_PILL_RE = re.compile(
+    r"<a\s[^>]*?href=['\"][^'\"]*/(?P<kind>pornstar|category)/[^'\"]*['\"]"
+    r".*?<span[^>]*>(?P<name>[^<]+)</span>",
+    re.IGNORECASE | re.DOTALL,
+)
+
+
+def _parse_iso_duration(value: str | None) -> int | None:
+    """`PT26M48S` → seconds. None when the format is unknown or the total is zero."""
+    if not value:
+        return None
+    m = _ISO_DUR_RE.match(value.strip())
+    if not m:
+        return None
+    total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
+    return total or None
+
+
+def _parse_iso_date(value: str | None) -> date | None:
+    """`2026-06-09T16:00:00+00:00` → date. None on parse failure."""
+    if not value:
+        return None
+    try:
+        return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
+    except ValueError:
+        m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
+        if m:
+            try:
+                return date.fromisoformat(m.group(1))
+            except ValueError:
+                return None
+        return None
+
+
+def _extract_video_object(html: str) -> dict | None:
+    """First JSON-LD VideoObject in the HTML (hqfap emits one, a flat dict)."""
+    for m in _JSONLD_RE.finditer(html):
+        raw = m.group(1).strip()
+        if not raw:
+            continue
+        try:
+            data = json.loads(raw)
+        except (json.JSONDecodeError, ValueError):
+            continue
+        items = data if isinstance(data, list) else [data]
+        for obj in items:
+            if isinstance(obj, dict) and obj.get("@type") == "VideoObject":
+                return obj
+    return None
+
+
+class HQFapScraper(BaseBrowseScraper):
+    sitetag = "hqfapcom"
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Catalogue of scene URLs from the sitemaps, newest-first. Lazily initialised once
+        # per instance (browse_latest and deep_crawl create an instance per run, so the 13
+        # XML fetches are amortised over the whole run).
+        self._catalog: list[str] | None = None
+
+    # crawl_page override (like EpornerApiScraper) — the listing is not pageable via
+    # GET; the sitemap is the pagination source. _listing_url/_extract_scene_urls are
+    # unused but abstract — we provide no-op implementations.
+    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
+        return _SITEMAP_INDEX
+
+    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
+        return []
+
+    def _load_catalog(self) -> list[str] | None:
+        """Full list of scene URLs sorted by lastmod desc. None = fetch failure."""
+        if self._catalog is not None:
+            return self._catalog
+        try:
+            idx = browser_get(_SITEMAP_INDEX, timeout=self._timeout)
+            idx.raise_for_status()
+        except Exception as e:
+            log.warning("hqfap: sitemap index fetch failed: %s", e)
+            return None
+        sitemap_urls = [
+            u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
+        ]
+        if not sitemap_urls:
+            log.warning("hqfap: sitemap index has no video sitemaps")
+            return None
+
+        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
+        for sm_url in sitemap_urls:
+            try:
+                sm = browser_get(sm_url, timeout=self._timeout)
+                sm.raise_for_status()
+            except Exception as e:
+                # One missing sitemap ≠ total failure — the rest of the catalogue is enough.
+                log.warning("hqfap: sitemap fetch failed %s: %s", sm_url, e)
+                continue
+            for block in _URL_BLOCK_RE.findall(sm.text):
+                loc_m = _SITEMAP_LOC_RE.search(block)
+                if not loc_m or "/watch/" not in loc_m.group(1):
+                    continue
+                lastmod_m = _LASTMOD_RE.search(block)
+                entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))
+
+        if not entries:
+            return None
+        # Sort newest-first, then dedup by scene id (the sitemap can repeat a URL across files).
+        entries.sort(key=lambda e: e[0], reverse=True)
+        seen_ids: set[str] = set()
+        catalog: list[str] = []
+        for _, url in entries:
+            id_m = _SCENE_ID_RE.search(url)
+            key = id_m.group(1) if id_m else url
+            if key in seen_ids:
+                continue
+            seen_ids.add(key)
+            catalog.append(url)
+        log.info("hqfap: catalog loaded — %d scenes from %d sitemaps",
+                 len(catalog), len(sitemap_urls))
+        self._catalog = catalog
+        return catalog
+
+    def crawl_page(self, page: int) -> list[RawScene] | None:
+        catalog = self._load_catalog()
+        if catalog is None:
+            return None
+        start = (page - 1) * _PAGE_SIZE
+        chunk = catalog[start:start + _PAGE_SIZE]
+        if not chunk:
+            return []
+        out: list[RawScene] = []
+        for scene_url in chunk:
+            try:
+                res = browser_get(scene_url, timeout=self._timeout)
+                res.raise_for_status()
+            except Exception as e:
+                log.info("hqfap detail fetch failed %s: %s", scene_url, e)
+                continue
+            try:
+                raw = self._parse_detail(scene_url, res.text)
+            except Exception as e:
+                log.warning("hqfap detail parse failed %s: %s", scene_url, e)
+                continue
+            if raw is not None:
+                out.append(raw)
+        return out
+
+    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
+        video = _extract_video_object(detail_html)
+        if not video:
+            log.info("hqfap: no JSON-LD VideoObject on %s", scene_url)
+            return None
+
+        title = (video.get("name") or "").strip()
+        if not title:
+            return None
+
+        id_m = _SCENE_ID_RE.search(scene_url)
+        scene_id = id_m.group(1) if id_m else None
+
+        duration_sec = _parse_iso_duration(video.get("duration"))
+        release_date = _parse_iso_date(video.get("uploadDate"))
+        thumbnail_url = video.get("thumbnailUrl") or None
+
+        # Pills: pornstar → performer; category → tag, unless it has the " Clips" suffix
+        # (studio categories from the pornhd import, e.g. "Filthy Kings Clips" → "Filthy Kings").
+        # Censored names (`Te***`) are skipped — asterisks are not data.
+        studio: RawStudio | None = None
+        performers: list[RawPerformer] = []
+        tags: list[RawTag] = []
+        seen_perf: set[str] = set()
+        seen_tag: set[str] = set()
+        for m in _PILL_RE.finditer(detail_html):
+            name = m.group("name").strip()
+            if not name or "*" in name:
+                continue
+            slug = slugify(name)
+            if not slug:
+                continue
+            if m.group("kind").lower() == "pornstar":
+                if slug not in seen_perf:
+                    seen_perf.add(slug)
+                    performers.append(
+                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
+                    )
+            elif name.lower().endswith(" clips"):
+                if studio is None:
+                    studio_name = name[: -len(" clips")].strip()
+                    if studio_name:
+                        studio = RawStudio(
+                            external_id=f"{self.sitetag}:studio:{slugify(studio_name)}",
+                            name=studio_name,
+                            slug=slugify(studio_name),
+                        )
+            elif slug not in seen_tag:
+                seen_tag.add(slug)
+                tags.append(
+                    RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
+                )
+
+        # Phash: thumbnails are re-encoded webp posters from pornhd.pet — for studio
+        # content they are sometimes original studio art (chance of a phash match); amateur
+        # ones will not match. Graceful: miss → composite scoring (title+performer+duration).
+        fingerprints: list[RawFingerprint] = []
+        if thumbnail_url:
+            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
+            if ph:
+                fingerprints.append(RawFingerprint(kind="phash", value=ph))
+
+        # Stream: the JSON-LD contentUrl is a direct mp4, but its token (`time=`) expires —
+        # we do NOT store stream_url; extractor `hqfapcom` resolves a fresh one on demand.
+        playback_sources = [
+            RawPlaybackSource(
+                origin=f"tube:{self.sitetag}",
+                page_url=scene_url,
+                duration_sec=duration_sec,
+                thumbnail_url=thumbnail_url,
+            )
+        ]
+
+        return RawScene(
+            external_id=f"{self.sitetag}:{scene_id or scene_url}",
+            title=title,
+            release_date=release_date,
+            duration_sec=duration_sec,
+            url=scene_url,
+            studio=studio,
+            performers=performers,
+            tags=tags,
+            fingerprints=fingerprints,
+            playback_sources=playback_sources,
+        )
diff --git a/app/extractors/__init__.py b/app/extractors/__init__.py
index f7604dc..a0bf3c0 100644
--- a/app/extractors/__init__.py
+++ b/app/extractors/__init__.py
@@ -31,6 +31,7 @@ from app.extractors.tubes import (
     freshporno,
     fullmovies,
     hdporngg,
+    hqfap,
     hqporner,
     latestpornvideo,
     paradisehill,
@@ -174,6 +175,10 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
     # (#19866e9e wcześniej źle: założyłem „get_file 403 IP-bound" testem plain-curl.)
     "fullmoviesxxx": fullmovies.extract,
     "hdporngg": hdporngg.extract,
+    # hqfap — JSON-LD contentUrl = direct mp4 (cdnde.com newer / okcdn.ru older).
+    # Cross-IP test 2026-06-10: both CDNs are portable (`ip=`/`srcIp=` not enforced),
+    # tokens are time-bound → on-demand fetch yields a fresh URL. Mobile direct, zero proxy.
+    "hqfapcom": hqfap.extract,
 }
 
 
diff --git a/app/extractors/tubes/hqfap.py b/app/extractors/tubes/hqfap.py
new file mode 100644
index 0000000..ff918ff
--- /dev/null
+++ b/app/extractors/tubes/hqfap.py
@@ -0,0 +1,69 @@
+"""hqfap.com — direct stream extractor.
+
+Scene page (SSR, behind Cloudflare → curl_cffi in fetch_tube_html) carries a JSON-LD
+VideoObject whose `contentUrl` is a direct mp4. Two hosting generations in the catalogue:
+
+  - newer scenes: `v4.cdnde.com/...?video=&time=&ip=` — the `ip`
+    param is NOT enforced (cross-IP test 2026-06-10: local ISP and Hetzner VPS
+    both got 206); token is time-bound → on-demand resolve yields a fresh URL,
+  - older scenes: `vd*.okcdn.ru/?expires=...&srcIp=...&sig=...` (ok.ru) — also
+    portable cross-IP (206 from a different IP than the fetcher).
+
+Mobile plays direct (mobile_direct auto-detect in playback.py), zero proxy/WebView.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import re
+
+from app.extractors._fetch import fetch_tube_html
+from app.extractors._models import StreamSource
+
+log = logging.getLogger(__name__)
+
+_JSONLD_RE = re.compile(
+    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
+    re.IGNORECASE | re.DOTALL,
+)
+# Fallback when JSON-LD does not parse as JSON (trailing comma etc.); the raw match may hold JSON-escaped `\/`.
+_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+)"')
+_QUALITY_RE = re.compile(r"_(\d{3,4})p\.mp4", re.IGNORECASE)
+
+
+def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
+    html = fetch_tube_html(page_url, timeout=timeout)
+
+    content_url: str | None = None
+    for m in _JSONLD_RE.finditer(html):
+        raw = m.group(1).strip()
+        if not raw:
+            continue
+        try:
+            data = json.loads(raw)
+        except (json.JSONDecodeError, ValueError):
+            continue
+        items = data if isinstance(data, list) else [data]
+        for obj in items:
+            if isinstance(obj, dict) and obj.get("@type") == "VideoObject":
+                content_url = (obj.get("contentUrl") or "").strip() or None
+                break
+        if content_url:
+            break
+    if not content_url:
+        rm = _CONTENT_URL_RE.search(html)
+        content_url = rm.group(1).replace("\\/", "/").strip() if rm else None
+    if not content_url or not content_url.startswith("http"):
+        log.warning("hqfap: no contentUrl in JSON-LD for %s", page_url)
+        return None
+
+    qm = _QUALITY_RE.search(content_url)
+    quality = f"{qm.group(1)}p" if qm else None
+    return [
+        StreamSource(
+            link=content_url,
+            quality=quality,
+            type="mp4",
+            referer="https://hqfap.com/",
+        )
+    ]