From 210aec0536af35c0068c8d2e57d60db52714b051 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Sat, 6 Jun 2026 21:32:10 +0200 Subject: [PATCH] feat(scrapers): extract tags + description from porndish scene pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit porndish-only scenes had no tags and no description — the scraper only derived a title from the URL slug. The scene page (g1/bimber WP theme) carries both: a

list of /video2/<slug>/ links (the "#" tags the user sees, categories + co-performers) and a prose description

in .entry-content. Override _fetch_scene_metadata in PornDishScraper to pull both from one page fetch. Extend the base hook to accept an optional 4th return element (description) and thread it into RawScene.description — backward compatible with the existing 3-tuple (pornhat). Strips leading embed-button labels ("Video Player N", "Server N") from the prose. Verified on live scenes: clean tag lists + real descriptions. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../direct_scrapers/_search_base.py | 8 +- app/connectors/direct_scrapers/porndish.py | 97 +++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/app/connectors/direct_scrapers/_search_base.py b/app/connectors/direct_scrapers/_search_base.py index b21e6f6..2ee269b 100644 --- a/app/connectors/direct_scrapers/_search_base.py +++ b/app/connectors/direct_scrapers/_search_base.py @@ -198,13 +198,18 @@ class BaseSearchScraper(BaseDirectTubeScraper): studio: RawStudio | None = None extra_performers: list[RawPerformer] = [] tags: list[RawTag] = [] + description: str | None = None try: meta = self._fetch_scene_metadata(scene_url) except Exception as e: log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e) meta = None if meta is not None: - studio, extra_performers, tags = meta + # Back-compat: subclass może zwrócić 3-tuple (studio, performers, tags) + # LUB 4-tuple z dodatkowym `description` (porndish). Unpack defensywnie. + studio, extra_performers, tags = meta[0], meta[1], meta[2] + if len(meta) > 3: + description = meta[3] # Performer z query zawsze obecny (driver scraping). Extra performers # z detail page dorzucamy — dedupe po slug/name w resolverze. 
@@ -213,6 +218,7 @@ class BaseSearchScraper(BaseDirectTubeScraper): yield RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, + description=description, url=scene_url, playback_sources=[ RawPlaybackSource( diff --git a/app/connectors/direct_scrapers/porndish.py b/app/connectors/direct_scrapers/porndish.py index f25dbb7..6a8ffcb 100644 --- a/app/connectors/direct_scrapers/porndish.py +++ b/app/connectors/direct_scrapers/porndish.py @@ -2,12 +2,52 @@ Search: `https://porndish.com/page//?s=`. Scene URL: `https://porndish.com//`. + +Scene detail page (g1/bimber WordPress theme) zawiera: + - `

…` + — lista tagów (kategorie + performerzy wymieszani, tak jak porndish je pokazuje + jako „#" hashtagi). Bierzemy wszystkie jako RawTag (resolver dedupuje; performer + z query i tak dochodzi osobno). + - prozę opisu w `

` wewnątrz `.entry-content` (przed `entry-tags`, po embed-JS). +Bez `_fetch_scene_metadata` overrides scena z samego porndish miała 0 tagów i brak +description (bug-report od Jana 2026-06-06: „nie ma tagów (# na stronie) ani description"). """ from __future__ import annotations +import html as html_mod +import logging import re +from app.connectors.base import RawPerformer, RawStudio, RawTag from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.extractors import browser_get + +log = logging.getLogger(__name__) + +_ENTRY_TAG_RE = re.compile( + r']+href="[^"]*/video2/(?P[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>' + r'(?P[^<]+)', + re.IGNORECASE, +) +_ENTRY_CONTENT_RE = re.compile( + r']*class="[^"]*entry-content[^"]*"[^>]*>(?P.*?)', + re.IGNORECASE | re.DOTALL, +) +_SCRIPT_STYLE_RE = re.compile(r"|", re.IGNORECASE | re.DOTALL) +_P_RE = re.compile(r"]*>(?P.*?)

", re.IGNORECASE | re.DOTALL) +_TAG_STRIP_RE = re.compile(r"<[^>]+>") +_WS_RE = re.compile(r"\s+") +_SLUG_RE = re.compile(r"[^a-z0-9]+") + + +def _slugify(name: str) -> str: + return _SLUG_RE.sub("-", name.lower()).strip("-") or "tag" + + +def _clean_text(fragment: str) -> str: + txt = _TAG_STRIP_RE.sub(" ", fragment) + txt = html_mod.unescape(txt) + return _WS_RE.sub(" ", txt).strip() class PornDishScraper(BaseSearchScraper): @@ -17,3 +57,60 @@ class PornDishScraper(BaseSearchScraper): r'href="(?Phttps://porndish\.com/(?P[a-z0-9][a-z0-9\-]+))/"', re.IGNORECASE, ) + + def _fetch_scene_metadata( + self, scene_url: str + ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None: + """Fetch scene page → (studio=None, performers=[], tags, description). + + 4-elementowy zwrot (base unpacka opcjonalny `description`). porndish nie + wyróżnia studia, a performer z query dochodzi w base — tu tylko tagi + opis. + """ + try: + r = browser_get(scene_url, timeout=self._timeout) + except Exception as e: + log.debug("porndish meta fetch failed for %s: %s", scene_url, e) + return None + if r.status_code != 200 or not r.text: + return None + html = r.text + + # Tagi: entry-tag anchors (slug z /video2// + display name). + tags: list[RawTag] = [] + seen: set[str] = set() + for m in _ENTRY_TAG_RE.finditer(html): + name = html_mod.unescape(m.group("name")).strip() + slug = (m.group("slug") or "").strip().lower() or _slugify(name) + if not name or len(name) > 40 or slug in seen: + continue + seen.add(slug) + tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug)) + + # Description: najdłuższy prozowy

w .entry-content (bez entry-tags / embed-JS). + description: str | None = None + mc = _ENTRY_CONTENT_RE.search(html) + body = mc.group("body") if mc else html + body = _SCRIPT_STYLE_RE.sub(" ", body) + best = "" + for pm in _P_RE.finditer(body): + inner = pm.group("inner") + if "entry-tag" in inner: + continue + txt = _clean_text(inner) + # Pomijamy resztki JS / boilerplate „Watch … porn video" / przyciski serwerów. + if not txt or "getElementById" in txt or "addEventListener" in txt: + continue + low = txt.lower() + if low.startswith("watch ") and low.endswith("porn video"): + continue + if len(txt) > len(best): + best = txt + # Strip wiodące etykiety przycisków embedu („Video Player 1 Video Player 2 …", + # czasem „Server N") które wpadają na początek prozy. + best = re.sub(r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+", "", best, flags=re.IGNORECASE).strip() + if len(best) >= 40: + description = best + + if not tags and description is None: + return None + return (None, [], tags, description)