hqporner search post-filter kept a scene if its slug contained ANY query token (>=3 chars). For multi-word performer names this matched on a single common token (e.g. "anna","mia"), so the performer-driven ingest attributed the scene to EVERY performer sharing that token — scenes accumulated up to 503 wrong performers (hqporner = 5659 of 5897 scenes with >30 performers; bug-reports 2026-06-07). Switch ANY->ALL: the slug must contain every query token, requiring a full name match before attribution. Single-word names still work. Precision over recall — 144 wrong performers is far worse than missing a few loose matches. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
98 lines
3.7 KiB
Python
98 lines
3.7 KiB
Python
"""HQPornerScraper — direct HTML scrape hqporner search page.
|
|
|
|
Search URL: `https://hqporner.com/?q=<query>&p=<page>`. The static HTML returns ~50
`/hdporn/<id>-<slug>.html` links per page. The title is derived from the slug (the
porn-app data API returns a more accurate one but needs an extra round-trip — for the
MVP a slug-derived title is fine, since the resolver pulls it from the TPDB merge anyway).

Search is fuzzy: hqporner returns "Lola Noir" when searching for "Noir", etc. Results
are therefore filtered by whether the slug contains every query token of at least
3 characters — the same kind of post-filter as `fetch_scenes_for_search` in the
pornapp connector.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import urllib.parse
|
|
from collections.abc import Iterator
|
|
|
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
|
from app.extractors import browser_get
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# Scene links look like /hdporn/<digits>-<slug>.html; group 1 is the numeric id,
# group 2 the slug (any run of characters other than '"' and '.').
_SCENE_HREF_RE = re.compile(r'/hdporn/(\d+)-([^"\.]+)\.html')
|
|
|
|
|
|
class HQPornerScraper(BaseDirectTubeScraper):
    """Direct HTML scraper for hqporner search result pages."""

    sitetag = "hqpornercom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes from one hqporner search page whose slug matches ``query``.

        Args:
            query: Search phrase; it is also emitted as the performer hint of
                every yielded scene. A blank query yields nothing.
            page: Result page number, sent as the ``p`` URL parameter.
            limit: Maximum number of scenes to yield. ``None`` means no cap;
                zero or a negative value yields nothing.

        Yields:
            One ``RawScene`` per unique scene link on the page whose slug
            contains every query token of at least 3 characters.

        A failed fetch or a non-200 response is logged and ends the iteration
        without raising.
        """
        name = query.strip()
        if not name:
            # A blank query would attribute every listed scene to a performer
            # with an empty name, so there is nothing sensible to return.
            log.debug("hqporner search skipped: blank query")
            return
        if limit is not None and limit <= 0:
            # The cap is otherwise only checked after a yield, which would let
            # one scene through for limit=0.
            return

        q = urllib.parse.quote_plus(name)
        url = f"https://hqporner.com/?q={q}&p={page}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            # Best-effort scraper: a failed fetch means "no results", not a crash.
            log.warning("hqporner search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.debug("hqporner search %s status=%d", url, r.status_code)
            return

        # Filter: the slug must contain ALL query words (>= 3 chars),
        # case-insensitive. Matching on ANY token used to attribute a scene to
        # every performer sharing one common token (e.g. "anna"/"mia"), which
        # produced scenes with hundreds of wrong performers. Requiring all
        # tokens demands a full-name match; single-word names still work.
        # NOTE(review): this is a substring test, and tokens shorter than
        # 3 chars are ignored (a query made only of short tokens is not
        # filtered at all) — confirm whether that is precise enough.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen_urls: set[str] = set()
        yielded = 0
        for m in _SCENE_HREF_RE.finditer(r.text):
            scene_id = m.group(1)
            slug_part = m.group(2)
            scene_url = f"https://hqporner.com/hdporn/{scene_id}-{slug_part}.html"
            # The same link can appear more than once in the page HTML.
            if scene_url in seen_urls:
                continue
            seen_urls.add(scene_url)

            # Title-token filter.
            slug_lower = slug_part.lower()
            if query_tokens and not all(tok in slug_lower for tok in query_tokens):
                continue

            # The page title is not fetched; derive a readable one from the slug.
            title = slug_part.replace("_", " ").replace("-", " ").strip()

            yield RawScene(
                external_id=f"hqpornercom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:hqpornercom",
                        page_url=scene_url,
                    )
                ],
                # Force the performer hint to the query: a per-performer search
                # that passed the filter is almost certainly about that
                # performer. The resolver adds the ScenePerformer link.
                performers=[RawPerformer(name=name)],
                raw={
                    "source": "direct_scraper:hqporner",
                    "query": query,
                    "page": page,
                    "scene_id": scene_id,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|