"""HQPornerScraper — direct HTML scrape hqporner search page. Search URL: `https://hqporner.com/?q=&p=`. Static HTML zwraca ~50 linków `/hdporn/-.html` per strona. Tytuł deducimy ze slug'a (porn-app data API zwraca dokładniejszy ale wymaga round-trip — dla MVP slug-derived OK, resolver i tak je sciagnie z TPDB merge). Search fuzzy: hqporner zwraca "Lola Noir" gdy szukamy "Noir" itp. Dlatego filtrujemy wyniki po tym czy slug zawiera query (lub jego token) — analogicznie jak `fetch_scenes_for_search` w pornapp connectorze. """ from __future__ import annotations import logging import re import urllib.parse from collections.abc import Iterator from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.extractors import browser_get log = logging.getLogger(__name__) _SCENE_HREF_RE = re.compile(r'/hdporn/(\d+)-([^"\.]+)\.html') class HQPornerScraper(BaseDirectTubeScraper): sitetag = "hqpornercom" def search( self, query: str, *, page: int = 1, limit: int | None = None, ) -> Iterator[RawScene]: q = urllib.parse.quote_plus(query.strip()) url = f"https://hqporner.com/?q={q}&p={page}" try: r = browser_get(url, timeout=30) except Exception as e: log.warning("hqporner search fetch failed: %s", e) return if r.status_code != 200: log.debug("hqporner search %s status=%d", url, r.status_code) return # Filtr: slug musi zawierać przynajmniej jedno z słów query (case-insensitive) # Eliminuje totalnie niezwiązane wyniki gdy fuzzy search szumi. query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3} seen_urls: set[str] = set() yielded = 0 for m in _SCENE_HREF_RE.finditer(r.text): scene_id = m.group(1) slug_part = m.group(2) scene_url = f"https://hqporner.com/hdporn/{scene_id}-{slug_part}.html" if scene_url in seen_urls: continue seen_urls.add(scene_url) # Title-token filter slug_lower = slug_part.lower() if query_tokens and not any(tok in slug_lower for tok in query_tokens): continue title = slug_part.replace("_", " ").replace("-", " ").strip() yield RawScene( external_id=f"hqpornercom:{scene_url}", title=title, url=scene_url, playback_sources=[ RawPlaybackSource( origin="tube:hqpornercom", page_url=scene_url, ) ], # Wymuszamy hint performera = query — search per performer name znaczy # że scena prawie na pewno o nim. Resolver dorobi ScenePerformer link. performers=[RawPerformer(name=query.strip())], raw={ "source": "direct_scraper:hqporner", "query": query, "page": page, "scene_id": scene_id, "url": scene_url, }, ) yielded += 1 if limit is not None and yielded >= limit: return