porndish-only scenes had no tags and no description — the scraper only derived a
title from the URL slug. The scene page (g1/bimber WP theme) carries both: a
<p class="entry-tags"> list of /video2/<slug>/ links (the "#" tags the user sees,
categories + co-performers) and a prose description <p> in .entry-content.
Override _fetch_scene_metadata in PornDishScraper to pull both from one page
fetch. Extend the base hook to accept an optional 4th return element
(description) and thread it into RawScene.description — backward compatible with
the existing 3-tuple (pornhat). Strips leading embed-button labels
("Video Player N", "Server N") from the prose. Verified on live scenes: clean
tag lists + real descriptions.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
244 lines
11 KiB
Python
244 lines
11 KiB
Python
"""BaseSearchScraper — shared search-page HTML scraping logic.

Pattern used by all tube discovery scrapers:
  1. Build the search URL from `_search_url_template` (formatted with query + page).
  2. Fetch the HTML (curl_cffi).
  3. Match `_scene_url_re` (regex whose `url` group, or group(1), is the scene URL;
     optionally a `slug` group used as the title source — an `id` group requires
     a `_slug_from_match` override).
  4. Filter results by query tokens (the slug must contain EVERY query token of
     at least `_query_token_min_len` characters) — the tubes' fuzzy search often
     returns unrelated results.
  5. Yield RawScene with `external_id=f"{sitetag}:{scene_url}"`.

Subclass overrides:
  - `sitetag: str` — e.g. "pornhubcom"
  - `_search_url_template: str` — with `{query}` and `{page}` placeholders
  - `_scene_url_re: re.Pattern[str]` — regex with named group `url` (scene URL)
  - `_title_from_slug(slug) -> str` — optional override (default: derived from the URL slug)
  - `_slug_from_match(match, scene_url) -> str` — text tested against the query
    tokens (default: `slug` group, else the last URL path segment)
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import urllib.parse
|
|
from collections.abc import Iterator
|
|
|
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
|
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
|
from app.extractors import browser_get
|
|
|
|
|
|
# Image source extraction: matches the `src`, `data-src`, `data-original`,
# `data-lazy-src` and `data-lazy` attributes (lazy-load library variants).
# An image file extension is required to limit false positives (sprite icons,
# spinners); the accepted extensions are jpg/jpeg/png/webp/gif. Only absolute
# (`http://`, `https://`) or protocol-relative (`//`) URLs match; capture
# group 1 is the URL including anything after the extension (e.g. a query string).
_IMG_SRC_RE = re.compile(
    r'<img[^>]+(?:src|data-src|data-original|data-lazy-src|data-lazy)=["\']'
    r'((?://|https?://)[^"\']+\.(?:jpg|jpeg|png|webp|gif)[^"\']*)',
    re.IGNORECASE,
)

# Module-level logger named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class BaseSearchScraper(BaseDirectTubeScraper):
    """Shared search-page scraping flow for tube scrapers.

    A subclass supplies the search URL template and the scene-URL regex; the
    fetch / match / filter / yield flow implemented in `search()` is shared.

    NOTE(review): the original note says sites that need special request
    handling (e.g. Cloudflare-protected ones) override `_search_headers()` or
    the whole fetch. No `_search_headers()` hook is defined in this class —
    confirm it exists on the base class.
    """

    #: Search page URL format with `{query}` (output of `_format_query_for_url`,
    #: quote_plus-encoded by default) and `{page}` (int) placeholders.
    _search_url_template: str = ""

    #: Regex matching a scene URL in the search HTML. Expected group `url` (full
    #: scene URL, falling back to group 1); optional group `slug` (used for
    #: title derivation when the URL exposes one).
    _scene_url_re: re.Pattern[str] = re.compile(r"$^")  # placeholder — subclass override

    #: Minimum length of a query token used for result filtering (shorter tokens
    #: are ignored so they do not match unrelated slugs).
    _query_token_min_len: int = 3

    #: Search HTTP timeout, passed to `browser_get`.
    _timeout: float = 30.0

    #: Slugs to reject (navigation / footer links that match the regex but are
    #: not scenes). Useful for WordPress-like tubes where the scene URL pattern
    #: (`<host>/<slug>/`) coincides with `/categories/`, `/actors/` etc.
    _nav_slug_blacklist: frozenset[str] = frozenset({
        "actors", "actor", "actress", "categories", "category", "tags", "tag",
        "feed", "dmca", "contact-us", "contact", "comments", "wp-content",
        "wp-admin", "wp-includes", "wp-login.php", "page", "?filter", "?s",
        "about", "about-us", "privacy", "privacy-policy", "tos", "terms",
        "2257", "18-u-s-c-2257", "sitemap", "sitemap.xml",
    })

    #: Window (in characters) on each side of a scene URL match in which an
    #: `<img>` is searched for as the thumbnail. WordPress-like tubes put the
    #: thumb in `<a href="..."><img src="...thumb.jpg"></a>`.
    _thumbnail_window: int = 800

    def _scene_url_from_match(self, m: re.Match[str]) -> str:
        """Return the scene URL captured by a `_scene_url_re` match.

        Prefers the named group `url` and falls back to group 1 when the
        pattern defines no such group. Override when the regex uses its
        groups differently.
        """
        try:
            return m.group("url")
        except IndexError:
            # `Match.group` raises IndexError for an unknown group name.
            return m.group(1)

    def _slug_from_match(self, m: re.Match[str], scene_url: str) -> str:
        """Return the slug used for query-token filtering and title derivation.

        Uses the named group `slug` when the pattern defines it and it is
        non-empty; otherwise the last path segment of `scene_url` (empty string
        when the URL has no path).
        """
        named_slug = m.groupdict().get("slug")
        if named_slug:
            return named_slug
        # Fallback: parse the URL and take its last path segment.
        path = urllib.parse.urlparse(scene_url).path.rstrip("/")
        return path.split("/")[-1] if path else ""

    def _title_from_slug(self, slug: str) -> str:
        """Turn a URL slug into a title by replacing `_` and `-` with spaces."""
        return slug.replace("_", " ").replace("-", " ").strip()

    def _format_query_for_url(self, query: str) -> str:
        """Format the query for insertion into `_search_url_template`.

        Default: strip and URL-encode (spaces become `+`). Override when a tube
        needs a different format — e.g. KVS-style sites would use a slug
        (spaces become `-`).
        """
        return urllib.parse.quote_plus(query.strip())

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> (
        tuple[RawStudio | None, list[RawPerformer], list[RawTag]]
        | tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None]
        | None
    ):
        """Optional hook: fetch the scene detail page and extract metadata.

        The default returns None (no detail fetch). `search()` calls this once
        PER SCENE, so an override adds one HTTP request per match. Overrides
        may raise freely: `search()` catches the exception and continues
        without metadata.

        Returns:
            None, or `(studio, performers, tags)`, or the same tuple with a
            4th element `description`. Any element may be None / an empty
            list; `search()` treats a None list as empty.
        """
        return None

    @staticmethod
    def _absolutize_url(candidate: str, page_url: str) -> str:
        """Make a scraped URL absolute relative to the page it came from.

        Protocol-relative URLs (`//host/...`) get an `https:` scheme;
        root-relative URLs (`/path`) get the scheme and host of `page_url`;
        anything else is returned unchanged.
        """
        if candidate.startswith("//"):
            return "https:" + candidate
        if candidate.startswith("/"):
            origin = urllib.parse.urlparse(page_url)
            return f"{origin.scheme}://{origin.netloc}{candidate}"
        return candidate

    def _thumbnail_near_match(
        self, html: str, m: re.Match[str], page_url: str
    ) -> str | None:
        """Return the first image URL found within the window around `m`, if any.

        The window spans `_thumbnail_window` characters before the match start
        and after the match end, clamped to the document.
        """
        window_start = max(0, m.start() - self._thumbnail_window)
        window_end = min(len(html), m.end() + self._thumbnail_window)
        img_m = _IMG_SRC_RE.search(html[window_start:window_end])
        if img_m is None:
            return None
        return self._absolutize_url(img_m.group(1).strip(), page_url)

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Scrape one search results page and yield a `RawScene` per matching scene.

        Args:
            query: Search phrase; it is also emitted as the first performer of
                every yielded scene.
            page: Page number substituted into `_search_url_template`.
            limit: Maximum number of scenes to yield; None means no limit and a
                value <= 0 yields nothing (no request is made).

        Yields:
            One `RawScene` per unique, non-blacklisted scene URL whose slug
            contains every query token of at least `_query_token_min_len`
            characters.

        Raises:
            NotImplementedError: when the subclass did not set
                `_search_url_template`.

        A failed fetch or a non-200 response is logged and ends the iteration
        without yielding anything.
        """
        if not self._search_url_template:
            raise NotImplementedError(f"{type(self).__name__}._search_url_template not set")
        # Previously the limit was only checked after the first yield, so
        # limit=0 still fetched the page and produced one scene.
        if limit is not None and limit <= 0:
            return

        q = self._format_query_for_url(query)
        url = self._search_url_template.format(query=q, page=page)

        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            # Best-effort boundary: a failing site must not break the caller.
            log.warning("%s search fetch failed: %s", self.sitetag, e)
            return
        if r.status_code != 200:
            log.debug("%s search %s status=%d", self.sitetag, url, r.status_code)
            return

        html = r.text
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }

        seen: set[str] = set()
        yielded = 0
        for m in self._scene_url_re.finditer(html):
            # A `url` group that did not participate yields None; skip such
            # matches (and empty captures) instead of crashing on `.strip()`.
            scene_url = (self._scene_url_from_match(m) or "").strip()
            if not scene_url:
                continue
            scene_url = self._absolutize_url(scene_url, url)
            if scene_url in seen:
                continue
            seen.add(scene_url)

            slug = self._slug_from_match(m, scene_url)
            slug_lower = slug.lower()
            if slug_lower in self._nav_slug_blacklist:
                continue
            # Strict: ALL query tokens must appear in the slug. An earlier
            # `any()` check let a scene through when just one token matched —
            # for query "ava koxxx" every "ava-*" slug (other performers'
            # scenes) ended up labelled with the queried performer.
            if query_tokens and not all(tok in slug_lower for tok in query_tokens):
                continue

            title = self._title_from_slug(slug)

            # Thumbnail: first <img> within ±_thumbnail_window chars of the match.
            thumb_url = self._thumbnail_near_match(html, m, url)

            # Optional metadata fetch (studio / extra performers / tags /
            # description). The default hook returns None.
            studio: RawStudio | None = None
            extra_performers: list[RawPerformer] = []
            tags: list[RawTag] = []
            description: str | None = None
            try:
                meta = self._fetch_scene_metadata(scene_url)
            except Exception as e:
                log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e)
                meta = None
            if meta is not None:
                # Back-compat: the hook may return a 3-tuple (studio,
                # performers, tags) or a 4-tuple with a trailing description.
                # The hook contract allows None elements, so coerce None lists
                # to empty ones — unpacking `*None` below would raise.
                studio = meta[0]
                extra_performers = list(meta[1] or [])
                tags = list(meta[2] or [])
                if len(meta) > 3:
                    description = meta[3]

            # The performer from the query is always present (it drives the
            # scrape). Extra performers from the detail page are appended;
            # per the original note, deduplication happens in the resolver.
            all_performers = [RawPerformer(name=query.strip()), *extra_performers]

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                description=description,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb_url,
                    )
                ],
                performers=all_performers,
                studio=studio,
                tags=tags,
                raw={
                    "source": f"direct_scraper:{self.sitetag}",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                    "search_url": url,
                    "thumbnail_url": thumb_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|