"""HDPorn92Scraper — direct HTML scrape hdporn92.com search.
Search URL: `https://hdporn92.com/page/<page>/?s=<query>`. Scene URL format:
`https://hdporn92.com/<slug>/` (a single path segment). Navigation links
(`/categories/`, `/actors/`, `/feed/`, `/dmca/`, `/contact-us/`) and
external links (badoinkvr etc.) must be filtered out.
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
# Module-level logger, used by the scraper to report fetch problems.
log = logging.getLogger(__name__)
# Matches double-quoted absolute hrefs of the form
# https://hdporn92.com/<slug> with an optional trailing slash.
#   group 1: the URL without the trailing slash
#   group 2: the slug (lowercase letters, digits and hyphens; at least two
#            characters; never starting with a hyphen)
# The closing quote must follow the slug (or its trailing slash) directly, so
# links with additional path segments or a query string do not match.
_SCENE_URL_RE = re.compile(r'href="(https://hdporn92\.com/([a-z0-9][a-z0-9-]+))/?"')
# Single-segment slugs that are site navigation or WordPress-style paths
# rather than scenes; a matched slug found here is skipped.
# NOTE: entries containing "?" or "." ("?filter", "?s", "wp-login.php") cannot
# be produced by group 2 of _SCENE_URL_RE, so they never match as written.
_NAV_SLUGS = {
"actors", "categories", "tags", "feed", "dmca", "contact-us",
"comments", "wp-content", "wp-admin", "wp-includes", "wp-login.php",
"page", "?filter", "?s",
}
class HDPorn92Scraper(BaseDirectTubeScraper):
    """Direct HTML scraper for hdporn92.com search result pages.

    Scenes are discovered by matching ``_SCENE_URL_RE`` against the raw
    search page HTML. No scene page is fetched, so each title is derived
    from the URL slug.
    """

    sitetag = "hdporn92com"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes linked from one page of hdporn92.com search results.

        Args:
            query: Free-text search phrase. Surrounding whitespace is
                ignored; an empty or whitespace-only query yields nothing.
            page: Result page number inserted into the search URL.
            limit: Maximum number of scenes to yield. ``None`` means no cap;
                zero or a negative value yields nothing.

        Yields:
            One ``RawScene`` per unique scene link whose slug contains at
            least one query token of three or more characters. When the
            query has no such token, every non-navigation link is yielded.

        Fetch errors and non-200 responses are logged and end the iteration
        without raising.
        """
        term = query.strip()
        # Nothing to search for, or the caller asked for no results: stop
        # before doing any network I/O. Checking the cap up front also fixes
        # limit=0 previously yielding one scene.
        if not term or (limit is not None and limit <= 0):
            return

        url = f"https://hdporn92.com/page/{page}/?s={urllib.parse.quote_plus(term)}"
        try:
            r = browser_get(url, timeout=60)
        except Exception as e:
            # Best-effort boundary: the exception types raised by
            # browser_get are not visible here, so any failure simply ends
            # the search.
            log.warning("hdporn92 search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.warning(
                "hdporn92 search returned HTTP %s for %s", r.status_code, url
            )
            return

        # Tokens shorter than three characters are ignored as too unspecific
        # for substring matching against slugs.
        query_tokens = {tok for tok in term.lower().split() if len(tok) >= 3}
        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            slug = m.group(2)
            # Normalise to a trailing slash so both href spellings dedupe.
            scene_url = m.group(1) + "/"
            if slug in _NAV_SLUGS or scene_url in seen:
                continue
            seen.add(scene_url)

            # Relevance filter: keep only links whose slug mentions at least
            # one query token (skipped when there are no usable tokens).
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            yield RawScene(
                external_id=f"hdporn92com:{scene_url}",
                # The search page is not parsed for titles; derive one from
                # the slug instead.
                title=slug.replace("-", " ").strip(),
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(origin="tube:hdporn92com", page_url=scene_url)
                ],
                # NOTE(review): the query is recorded as the performer name,
                # which presumes callers search by performer — confirm.
                performers=[RawPerformer(name=term)],
                raw={
                    "source": "direct_scraper:hdporn92",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return