Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
99 lines
4.1 KiB
Python
99 lines
4.1 KiB
Python
"""pornhat.com — search-mode scraper (performer-driven backfill).
|
|
|
|
KVS engine. Search URL: `/search/<query>/` with `+` as the space separator (the
scraper itself sends a `-`-joined, slug-style query, which the site accepts as
well). Scene URLs are `/video/<slug>/` (the slug has no ID prefix, unlike
3Movs/OK.xxx). The slug contains the query tokens when the match is relevant,
so the results effectively filter themselves.

Auto-screenshot thumbnails (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
are NOT suitable for canonical matching via phash (checked in the 2026-05-12
probe: 8%). The value of this scraper is instead in discovering new scenes of a
performer that other tubes / canonical sources do not have. It is mostly orphan
ingest, but for popular performers it may catch studio scenes that we do not
have in TPDB yet.

Metadata enrichment: the scene page has `class="info-video js-ajax-{dvd,model,tag}"`
divs carrying `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. We
parse them in `_fetch_scene_metadata()` to attach the studio (dvd), additional
performers (models), and tags to every scene.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
|
|
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
from app.extractors import browser_get
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# Locates the metadata containers on a scene page. Each one is a tag whose
# `class` attribute starts with `info-video js-ajax-<kind>` and which carries a
# `data-setup='<json>'` attribute. The JSON sits in a single-quoted HTML
# attribute and uses double quotes internally for its string values.
# Named groups: `kind` says which entity was matched (dvd / model / tag) and
# `json` holds the raw attribute payload.
_AJAX_DATA_RE = re.compile(
    # double-quoted class attribute; captures the entity kind
    r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\""
    # any further attributes inside the same opening tag
    r"[^>]*"
    # single-quoted JSON payload
    r"data-setup='(?P<json>[^']+)'",
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
class PornHatScraper(BaseSearchScraper):
    """Search-mode scraper for pornhat.com (KVS engine).

    Provides the site-specific pieces on top of ``BaseSearchScraper``: the
    site tag, the paginated search URL template, the regex that extracts
    scene links from search HTML, the query formatter, and a scene-detail
    fetcher that reads studio / performer / tag metadata from the
    ``js-ajax-{dvd,model,tag}`` containers of a scene page.
    """

    # Site identifier; matches the `pornhatcom:` prefix of the external IDs
    # built in `_fetch_scene_metadata`.
    sitetag = "pornhatcom"
    # KVS-style pagination: /search/<query>/<page>/ (page 1 also works with an
    # explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
    # PornHat search HTML uses relative hrefs `/video/<slug>/`. BaseSearchScraper
    # automatically converts relative → absolute via urlparse(search_url).netloc.
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Turn a free-text query into the slug-style search path segment.

        The query is lower-cased and its words are joined with ``-`` (KVS
        slug style; the site also accepts ``+``). Every run of whitespace,
        including tabs, newlines and repeated spaces, collapses into a single
        separator, so ``"Jane   Doe"`` yields ``jane-doe`` rather than
        ``jane---doe``.

        Args:
            query: Search phrase, typically a performer name.

        Returns:
            The formatted query for ``_search_url_template``.
        """
        return "-".join(query.lower().split())

    @staticmethod
    def _parse_data_setup(raw: str) -> dict | None:
        """Decode one ``data-setup`` attribute payload into a JSON object.

        Returns the parsed ``dict``, or ``None`` when the payload is not valid
        JSON or is valid JSON of another type (list, string, number, null).
        """
        import html  # local import: the module's import block does not have it

        # The payload is copied from raw HTML, where attribute values carry
        # character references (`&amp;`, `&#039;`, ...). Decode them first, as
        # a browser does before handing the value to a JSON parser. If the
        # decoded text is not valid JSON, retry with the raw text so that
        # nothing that parsed before this decoding step is lost.
        for candidate in dict.fromkeys((html.unescape(raw), raw)):
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            return data if isinstance(data, dict) else None
        return None

    @staticmethod
    def _clean_text(value: object) -> str:
        """Return ``value`` stripped if it is a string, otherwise ``""``."""
        return value.strip() if isinstance(value, str) else ""

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch a scene page and parse its `js-ajax-{dvd,model,tag}` JSON.

        Args:
            scene_url: URL of the scene detail page.

        Returns:
            ``None`` when the request raises or answers with a status other
            than 200. Otherwise a ``(studio, performers, tags)`` tuple, where
            ``studio`` is ``None`` if the page has no usable ``dvd`` entry and
            either list may be empty. Entries with a malformed payload or
            without a title are skipped.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                log.debug(
                    "pornhat detail fetch %s returned HTTP %s",
                    scene_url,
                    r.status_code,
                )
                return None
        except Exception as e:
            # Best-effort enrichment: a failed detail fetch must not abort the
            # scrape, so the error is logged and reported as "no metadata".
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None

        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []

        for m in _AJAX_DATA_RE.finditer(r.text):
            kind = m.group("kind").lower()
            data = self._parse_data_setup(m.group("json"))
            if data is None:
                continue
            name = self._clean_text(data.get("title"))
            slug = self._clean_text(data.get("dir")) or None
            if not name:
                continue
            if kind == "dvd":
                # `dvd` is the studio/series wrapper (e.g. "Adult Time"). The
                # first occurrence is taken as the scene's studio; there is
                # rarely more than one.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                performers.append(RawPerformer(name=name))
            elif kind == "tag":
                tags.append(RawTag(
                    external_id=f"pornhatcom:tag:{slug or name.lower()}",
                    name=name,
                    slug=slug,
                ))

        return studio, performers, tags
|