"""HDPorn92Scraper — direct HTML scrape hdporn92.com search. Search: `https://hdporn92.com/page//?s=`. Scene URL format: `https://hdporn92.com//` (jeden segment ścieżki). Trzeba odsiać nawigację (`/categories/`, `/actors/`, `/feed/`, `/dmca/`, `/contact-us/`, external links badoinkvr/etc.). """ from __future__ import annotations import logging import re import urllib.parse from collections.abc import Iterator from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.extractors import browser_get log = logging.getLogger(__name__) _SCENE_URL_RE = re.compile(r'href="(https://hdporn92\.com/([a-z0-9][a-z0-9-]+))/?"') _NAV_SLUGS = { "actors", "categories", "tags", "feed", "dmca", "contact-us", "comments", "wp-content", "wp-admin", "wp-includes", "wp-login.php", "page", "?filter", "?s", } class HDPorn92Scraper(BaseDirectTubeScraper): sitetag = "hdporn92com" def search( self, query: str, *, page: int = 1, limit: int | None = None, ) -> Iterator[RawScene]: q = urllib.parse.quote_plus(query.strip()) url = f"https://hdporn92.com/page/{page}/?s={q}" try: r = browser_get(url, timeout=60) except Exception as e: log.warning("hdporn92 search fetch failed: %s", e) return if r.status_code != 200: return query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3} seen: set[str] = set() yielded = 0 for m in _SCENE_URL_RE.finditer(r.text): scene_url = m.group(1) + "/" slug = m.group(2) if slug in _NAV_SLUGS: continue if scene_url in seen: continue seen.add(scene_url) slug_lower = slug.lower() if query_tokens and not any(tok in slug_lower for tok in query_tokens): continue title = slug.replace("-", " ").strip() yield RawScene( external_id=f"hdporn92com:{scene_url}", title=title, url=scene_url, playback_sources=[ RawPlaybackSource(origin="tube:hdporn92com", page_url=scene_url) ], performers=[RawPerformer(name=query.strip())], raw={ "source": "direct_scraper:hdporn92", "query": query, "page": page, "url": scene_url, }, ) yielded += 1 if limit is not None and yielded >= limit: return