goon/app/connectors/direct_scrapers/_search_base.py
jtrzupek 210aec0536 feat(scrapers): extract tags + description from porndish scene pages
porndish-only scenes had no tags and no description — the scraper only derived a
title from the URL slug. The scene page (g1/bimber WP theme) carries both: a
<p class="entry-tags"> list of /video2/<slug>/ links (the "#" tags the user sees,
categories + co-performers) and a prose description <p> in .entry-content.

Override _fetch_scene_metadata in PornDishScraper to pull both from one page
fetch. Extend the base hook to accept an optional 4th return element
(description) and thread it into RawScene.description — backward compatible with
the existing 3-tuple (pornhat). Strips leading embed-button labels
("Video Player N", "Server N") from the prose. Verified on live scenes: clean
tag lists + real descriptions.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 21:32:10 +02:00

244 lines
11 KiB
Python

"""BaseSearchScraper — shared search-page HTML scraping logika.
Wzorzec stosowany przez wszystkie tube'y discovery scrapers:
1. Build search URL z `_search_url_template` (formatowane query+page).
2. Fetch HTML curl_cffi.
3. Match `_scene_url_re` (regex z grupą `url` lub group(1) jako scene URL,
opcjonalnie `slug` lub `id` jako tytuł source).
4. Filtruj wyniki po query tokens (slug musi zawierać ≥1 token z query) —
fuzzy search tube'ów często zwraca niezwiązane wyniki.
5. Yield RawScene z `external_id=f"{sitetag}:{scene_url}"`.
Subclass override:
- `sitetag: str` — np. "pornhubcom"
- `_search_url_template: str` — z `{query}` i `{page}` placeholderami
- `_scene_url_re: re.Pattern[str]` — regex z named group `url` (scene URL)
- `_title_from_match(match) -> str` — opcjonalny override (default: derive z URL slug)
- `_token_filter_text(match) -> str` — co testować na query tokens (default: cała URL)
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
# Thumbnail extraction: captures the URL held by an <img> tag's `src`, `data-src`,
# `data-original`, `data-lazy-src` or `data-lazy` attribute (the lazy-load library
# variants). The URL must be absolute or protocol-relative and carry an image
# extension (jpg/jpeg/png/webp/gif) — this limits false positives such as sprite
# icons and spinners. Matching is case-insensitive.
_IMG_SRC_RE = re.compile(
    r'<img[^>]+(?:src|data-src|data-original|data-lazy-src|data-lazy)=["\']'
    r'((?://|https?://)[^"\']+\.(?:jpg|jpeg|png|webp|gif)[^"\']*)',
    flags=re.IGNORECASE,
)

# Module-level logger shared by every scraper built on this base.
log = logging.getLogger(__name__)
class BaseSearchScraper(BaseDirectTubeScraper):
    """Shared search-page scraping flow for tube discovery scrapers.

    A subclass supplies the search URL template and the scene-URL regex; the
    fetch / match / filter / yield pipeline in `search()` is common to all.

    NOTE(review): the previous docstring said scrapers needing special headers
    (e.g. Cloudflare-protected sites) override `_search_headers()`. `search()`
    in this class never calls such a hook, so unless it is wired in elsewhere
    those scrapers have to override `search()` itself.
    """

    #: Search page URL format with `{query}` (as produced by
    #: `_format_query_for_url`) and `{page}` (int) placeholders.
    _search_url_template: str = ""

    #: Regex matching a scene URL in the search HTML. Needs a `url` group (or a
    #: first positional group) holding the scene URL; an optional `slug` group
    #: is used for filtering and title derivation.
    _scene_url_re: re.Pattern[str] = re.compile(r"$^")  # placeholder — subclass override

    #: Query tokens shorter than this are ignored by the result filter so they
    #: do not match unrelated slugs.
    _query_token_min_len: int = 3

    #: Timeout passed to `browser_get` for the search request.
    _timeout: float = 30.0

    #: Slugs to reject: navigation / footer links that match the scene regex
    #: but are not scenes. Useful for WordPress-like tubes where the scene URL
    #: pattern (`<host>/<slug>/`) collides with `/categories/`, `/actors/` etc.
    _nav_slug_blacklist: frozenset[str] = frozenset({
        "actors", "actor", "actress", "categories", "category", "tags", "tag",
        "feed", "dmca", "contact-us", "contact", "comments", "wp-content",
        "wp-admin", "wp-includes", "wp-login.php", "page", "?filter", "?s",
        "about", "about-us", "privacy", "privacy-policy", "tos", "terms",
        "2257", "18-u-s-c-2257", "sitemap", "sitemap.xml",
    })

    #: Number of characters on each side of a scene URL match that is searched
    #: for an `<img>` to use as the thumbnail. WordPress-like tubes put it in
    #: `<a href="..."><img src="...thumb.jpg"></a>`.
    _thumbnail_window: int = 800

    def _scene_url_from_match(self, m: re.Match[str]) -> str:
        """Return the scene URL captured by `m`.

        Prefers the named group `url` and falls back to group(1) when the
        pattern has no such group. A group that did not participate in the
        match yields "" rather than None, so callers can skip the match.

        Raises:
            IndexError: the pattern has neither a `url` group nor any group.
        """
        try:
            captured = m.group("url")
        except IndexError:
            # `re` raises IndexError for an unknown group name.
            captured = m.group(1)
        return captured or ""

    def _slug_from_match(self, m: re.Match[str], scene_url: str) -> str:
        """Return the slug used for query-token filtering and title derivation.

        Uses the regex's named group `slug` when it exists and is non-empty;
        otherwise the last path segment of `scene_url` ("" for an empty path).
        """
        slug = m.groupdict().get("slug")
        if slug:
            return slug
        # Fallback: parse the URL.
        path = urllib.parse.urlparse(scene_url).path.rstrip("/")
        return path.split("/")[-1] if path else ""

    def _title_from_slug(self, slug: str) -> str:
        """Turn a slug into a title: `_` and `-` become spaces, ends are stripped."""
        return slug.replace("_", " ").replace("-", " ").strip()

    def _format_query_for_url(self, query: str) -> str:
        """Format the query for `_search_url_template`.

        Default: strip and URL-encode (spaces → `+`). Override when a tube
        needs another format — e.g. a slug with spaces → `-`.
        """
        return urllib.parse.quote_plus(query.strip())

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> (
        tuple[RawStudio | None, list[RawPerformer], list[RawTag]]
        | tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None]
        | None
    ):
        """Optional hook: fetch the scene detail page and extract its metadata.

        Called once PER SCENE by `search()`, so an override adds one HTTP
        request per match. Overrides may raise freely; `search()` catches the
        exception and yields the scene without metadata.

        Returns:
            None (the default) to skip the detail fetch, or a tuple
            `(studio, performers, tags)` optionally followed by a 4th
            `description` element. `performers` / `tags` may be None or empty.
        """
        return None

    @staticmethod
    def _absolute_url(candidate: str, base: urllib.parse.ParseResult) -> str:
        """Resolve protocol-relative (`//…`) and host-relative (`/…`) URLs against `base`."""
        if candidate.startswith("//"):
            return "https:" + candidate
        if candidate.startswith("/"):
            return f"{base.scheme}://{base.netloc}{candidate}"
        return candidate

    def _nearest_thumbnail(self, html: str, start: int, end: int) -> str | None:
        """Return the image URL closest to the span `[start, end)` of `html`.

        Only `<img>` tags inside `_thumbnail_window` characters on either side
        of the span are considered. Picking the nearest one (instead of the
        first one in the window) keeps a neighbouring result card's thumbnail
        from being attributed to this scene.
        """
        lo = max(0, start - self._thumbnail_window)
        hi = min(len(html), end + self._thumbnail_window)
        best_url: str | None = None
        best_distance: int | None = None
        for img in _IMG_SRC_RE.finditer(html, lo, hi):
            if img.end() <= start:
                distance = start - img.end()
            elif img.start() >= end:
                distance = img.start() - end
            else:
                distance = 0  # overlaps the scene URL match
            if best_distance is None or distance < best_distance:
                best_url, best_distance = img.group(1).strip(), distance
        return best_url

    @staticmethod
    def _unpack_scene_metadata(meta):
        """Normalise a 3- or 4-element metadata tuple to (studio, performers, tags, description)."""
        studio = meta[0]
        performers = meta[1] or []  # the hook may return None here
        tags = meta[2] or []
        description = meta[3] if len(meta) > 3 else None
        return studio, performers, tags, description

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Scrape one search results page and yield a RawScene per matching scene.

        Args:
            query: free-text query; it is also recorded as the first performer
                of every yielded scene.
            page: results page number substituted into the URL template.
            limit: maximum number of scenes to yield; None means no limit and
                a value <= 0 yields nothing.

        Raises:
            NotImplementedError: the subclass did not set `_search_url_template`.

        A failed fetch or a non-200 response is logged and ends the iteration
        without raising.
        """
        if not self._search_url_template:
            raise NotImplementedError(f"{type(self).__name__}._search_url_template not set")
        if limit is not None and limit <= 0:
            # Nothing may be yielded, so skip the HTTP request altogether.
            return
        q = self._format_query_for_url(query)
        url = self._search_url_template.format(query=q, page=page)
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("%s search fetch failed: %s", self.sitetag, e)
            return
        if r.status_code != 200:
            log.debug("%s search %s status=%d", self.sitetag, url, r.status_code)
            return
        html = r.text
        # Host of the search page, used to resolve relative URLs.
        search_base = urllib.parse.urlparse(url)
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }
        seen: set[str] = set()
        yielded = 0
        for m in self._scene_url_re.finditer(html):
            scene_url = (self._scene_url_from_match(m) or "").strip()
            if not scene_url:
                continue
            scene_url = self._absolute_url(scene_url, search_base)
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug = self._slug_from_match(m, scene_url)
            slug_lower = slug.lower()
            if slug_lower in self._nav_slug_blacklist:
                continue
            # Strict: ALL query tokens must be in the slug. An earlier `any()`
            # let a scene through when a single token matched — for performer
            # "Ava Koxxx" every "ava-*" slug (Ava Devine, Ava Addams, ...) was
            # labelled as "Ava Koxxx". With `all()` the slug must contain every
            # sufficiently long token of the performer name.
            if query_tokens and not all(tok in slug_lower for tok in query_tokens):
                continue
            title = self._title_from_slug(slug)
            thumb_url = self._nearest_thumbnail(html, m.start(), m.end())
            if thumb_url is not None:
                thumb_url = self._absolute_url(thumb_url, search_base)
            # Optional metadata fetch (studio / extra performers / tags /
            # description). The default hook returns None.
            studio: RawStudio | None = None
            extra_performers: list[RawPerformer] = []
            tags: list[RawTag] = []
            description: str | None = None
            try:
                meta = self._fetch_scene_metadata(scene_url)
                if meta is not None:
                    # Back-compat: 3-tuple (studio, performers, tags) or a
                    # 4-tuple with a trailing `description`. A malformed tuple
                    # is treated like a failed fetch instead of aborting the
                    # whole search.
                    studio, extra_performers, tags, description = (
                        self._unpack_scene_metadata(meta)
                    )
            except Exception as e:
                log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e)
            # The performer from the query is always present (it drives the
            # scrape); extra performers from the detail page are appended.
            all_performers = [RawPerformer(name=query.strip()), *extra_performers]
            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                description=description,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb_url,
                    )
                ],
                performers=all_performers,
                studio=studio,
                tags=tags,
                raw={
                    "source": f"direct_scraper:{self.sitetag}",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                    "search_url": url,
                    "thumbnail_url": thumb_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return