goon/app/connectors/direct_scrapers/pornhat.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

99 lines
4.1 KiB
Python

"""pornhat.com — search-mode scraper (performer-driven backfill).
KVS engine. Search URL: `/search/<query>/`; the site accepts `+` as the space
separator, while `_format_query_for_url()` emits the slug-style `-`, which also
works. Scene URLs are `/video/<slug>/` (slug without an ID prefix, unlike
3Movs/OK.xxx). The slug contains the query tokens when the match is relevant,
so results filter themselves automatically.
Auto-screenshot thumbnails (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
are NOT suitable for canonical matching via phash (checked in a probe on 2026-05-12: 8%).
The value of this scraper is instead discovering new scenes of a performer that
other tubes / canonical sources do not have. Mostly orphan ingest, but for popular
performers it may catch studio scenes that we do not have in TPDB yet.
Metadata enrichment: the scene page has `class="info-video js-ajax-{dvd,model,tag}"`
divs with `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. We parse them in
`_fetch_scene_metadata()` to attach the studio (dvd), additional performers
(models), and tags to each scene.
"""
from __future__ import annotations
import json
import logging
import re
from app.connectors.base import RawPerformer, RawStudio, RawTag
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Matches an element with `class="info-video js-ajax-<kind>"` and captures its
# `data-setup='<json>'` attribute. The JSON is single-quoted (HTML attribute)
# with double quotes inside for string values, so the payload is read up to
# the first single quote.
# Named group `kind` (dvd | model | tag) tells the caller which entity type
# was matched; named group `json` holds the raw attribute payload.
_AJAX_DATA_RE = re.compile(
    r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
    re.IGNORECASE,
)
class PornHatScraper(BaseSearchScraper):
    """Search-mode scraper for pornhat.com (KVS engine).

    Defines the site tag, the search URL template and the scene-link regex,
    plus a query formatter and a scene-detail metadata parser. How these
    members are consumed is decided by `BaseSearchScraper`, which is not
    visible from this module.
    """

    sitetag = "pornhatcom"

    # KVS-style pagination: /search/<query>/<page>/ (page 1 also works with an
    # explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"

    # PornHat search HTML uses relative hrefs `/video/<slug>/`. Per the
    # original author's note, BaseSearchScraper converts relative -> absolute
    # via urlparse(search_url).netloc. An absolute
    # `https://www.pornhat.com` prefix is accepted by the pattern as well.
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Return *query* trimmed, lowercased and with spaces replaced by `-`."""
        # KVS: slug-style `-` separator; `+` works too.
        return query.strip().lower().replace(" ", "-")

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch the scene detail page and parse its `js-ajax-{dvd,model,tag}` data.

        Args:
            scene_url: URL of the scene detail page to download.

        Returns:
            `(studio, performers, tags)` when the page was fetched with HTTP
            200. `studio` is `None` when no usable `dvd` entry is present and
            either list may be empty. Returns `None` when the request raises
            or answers with any other status code.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                return None
        except Exception as e:
            # Best-effort enrichment: a failed detail fetch is logged and
            # reported as "no metadata" instead of propagating.
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None
        return self._parse_scene_metadata(r.text)

    @staticmethod
    def _parse_scene_metadata(
        page_html: str,
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]]:
        """Extract studio, performers and tags from scene-page HTML.

        Every `_AJAX_DATA_RE` match is decoded as JSON. Entries that are not
        valid JSON, are not JSON objects, or carry no non-empty string
        `title` are skipped instead of raising.
        """
        # NOTE(review): values are used exactly as captured from the raw HTML;
        # if the site entity-encodes characters inside `data-setup` (e.g.
        # `&amp;`), they are not decoded here — confirm against live markup.
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []

        for m in _AJAX_DATA_RE.finditer(page_html):
            try:
                data = json.loads(m.group("json"))
            except json.JSONDecodeError:
                continue
            # Valid JSON need not be an object (could be a list, string,
            # number); only objects expose the `title` / `dir` keys.
            if not isinstance(data, dict):
                continue

            raw_title = data.get("title")
            name = raw_title.strip() if isinstance(raw_title, str) else ""
            if not name:
                continue

            # A missing, empty or non-string `dir` is treated as "no slug".
            raw_dir = data.get("dir")
            slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or None

            kind = m.group("kind").lower()
            if kind == "dvd":
                # `dvd` is the studio/series wrapper (e.g. "Adult Time"). The
                # first occurrence is taken as the scene's studio — there is
                # rarely more than one.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                performers.append(RawPerformer(name=name))
            elif kind == "tag":
                tags.append(
                    RawTag(
                        external_id=f"pornhatcom:tag:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
                )

        return studio, performers, tags