From cd257740be6080960b5a87f17a49a37199e4608c Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Mon, 8 Jun 2026 09:28:18 +0200 Subject: [PATCH] =?UTF-8?q?fix(hqporner):=20require=20ALL=20query=20tokens?= =?UTF-8?q?=20in=20slug=20=E2=80=94=20stop=20performer=20over-attribution?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hqporner search post-filter kept a scene if its slug contained ANY query token (>=3 chars). For multi-word performer names this matched on a single common token (e.g. "anna","mia"), so the performer-driven ingest attributed the scene to EVERY performer sharing that token — scenes accumulated up to 503 wrong performers (hqporner = 5659 of 5897 scenes with >30 performers; bug-reports 2026-06-07). Switch ANY->ALL: the slug must contain every query token, requiring a full name match before attribution. Single-word names still work. Precision over recall — 144 wrong performers is far worse than missing a few loose matches. Co-Authored-By: Claude Opus 4.8 --- app/connectors/direct_scrapers/hqporner.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/app/connectors/direct_scrapers/hqporner.py b/app/connectors/direct_scrapers/hqporner.py index cc5bd62..7139af2 100644 --- a/app/connectors/direct_scrapers/hqporner.py +++ b/app/connectors/direct_scrapers/hqporner.py @@ -47,8 +47,12 @@ class HQPornerScraper(BaseDirectTubeScraper): log.debug("hqporner search %s status=%d", url, r.status_code) return - # Filtr: slug musi zawierać przynajmniej jedno z słów query (case-insensitive) - # Eliminuje totalnie niezwiązane wyniki gdy fuzzy search szumi. + # Filtr: slug musi zawierać WSZYSTKIE słowa query (≥3 znaki), case-insensitive. + # Wcześniej `any` (≥1 token) → przy 2-słownych nazwach match na jednym pospolitym + # tokenie (np. "anna"/"mia") atrybutował scenę do KAŻDEGO performera dzielącego ten + # token → sceny z setkami błędnych aktorek (do 503; hqporner = 5659/5897 takich scen, + # bug-report 2026-06-07). `all` wymaga pełnego dopasowania nazwy → precyzja. + # Pojedyncze nazwy ("Belladonna") nadal działają (jeden token musi być). query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3} seen_urls: set[str] = set() @@ -63,7 +67,7 @@ class HQPornerScraper(BaseDirectTubeScraper): # Title-token filter slug_lower = slug_part.lower() - if query_tokens and not any(tok in slug_lower for tok in query_tokens): + if query_tokens and not all(tok in slug_lower for tok in query_tokens): continue title = slug_part.replace("_", " ").replace("-", " ").strip()