feat(scrapers): extract tags + description from porndish scene pages

porndish-only scenes had no tags and no description — the scraper only derived a title from the URL slug. The scene page (g1/bimber WP theme) carries both: a <p class="entry-tags"> list of /video2/<slug>/ links (the "#" tags the user sees, categories + co-performers) and a prose description <p> in .entry-content. Override _fetch_scene_metadata in PornDishScraper to pull both from one page fetch. Extend the base hook to accept an optional 4th return element (description) and thread it into RawScene.description — backward compatible with the existing 3-tuple (pornhat). Strips leading embed-button labels ("Video Player N", "Server N") from the prose. Verified on live scenes: clean tag lists + real descriptions. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 21:32:10 +02:00 · 2026-06-06 21:32:10 +02:00 · 210aec0536
commit 210aec0536
parent 77323d23e6
2 changed files with 104 additions and 1 deletions
--- a/app/connectors/direct_scrapers/_search_base.py
+++ b/app/connectors/direct_scrapers/_search_base.py
@ -198,13 +198,18 @@ class BaseSearchScraper(BaseDirectTubeScraper):
            studio: RawStudio | None = None
            extra_performers: list[RawPerformer] = []
            tags: list[RawTag] = []
+            description: str | None = None
            try:
                meta = self._fetch_scene_metadata(scene_url)
            except Exception as e:
                log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e)
                meta = None
            if meta is not None:
-                studio, extra_performers, tags = meta
+                # Back-compat: subclass może zwrócić 3-tuple (studio, performers, tags)
+                # LUB 4-tuple z dodatkowym `description` (porndish). Unpack defensywnie.
+                studio, extra_performers, tags = meta[0], meta[1], meta[2]
+                if len(meta) > 3:
+                    description = meta[3]

            # Performer z query zawsze obecny (driver scraping). Extra performers
            # z detail page dorzucamy — dedupe po slug/name w resolverze.
@ -213,6 +218,7 @@ class BaseSearchScraper(BaseDirectTubeScraper):
            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
+                description=description,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
--- a/app/connectors/direct_scrapers/porndish.py
+++ b/app/connectors/direct_scrapers/porndish.py
@ -2,12 +2,52 @@

 Search: `https://porndish.com/page/<n>/?s=<q>`.
 Scene URL: `https://porndish.com/<slug>/`.
+
+Scene detail page (g1/bimber WordPress theme) zawiera:
+  - `<p class="entry-tags"><a href=".../video2/<slug>/" class="entry-tag entry-tag-N">Name</a>…`
+    — lista tagów (kategorie + performerzy wymieszani, tak jak porndish je pokazuje
+    jako „#" hashtagi). Bierzemy wszystkie jako RawTag (resolver dedupuje; performer
+    z query i tak dochodzi osobno).
+  - prozę opisu w `<p>` wewnątrz `.entry-content` (przed `entry-tags`, po embed-JS).
+Bez override'u `_fetch_scene_metadata` scena pochodząca wyłącznie z porndish miała 0 tagów
+i brak description (bug-report od Jana 2026-06-06: „nie ma tagów (# na stronie) ani description").
 """
 from __future__ import annotations

+import html as html_mod
+import logging
 import re

+from app.connectors.base import RawPerformer, RawStudio, RawTag
 from app.connectors.direct_scrapers._search_base import BaseSearchScraper
+from app.extractors import browser_get
+
+log = logging.getLogger(__name__)
+
+# Tag anchors: <a> elements carrying an `entry-tag` class and a /video2/<slug>/ href.
+# The class is checked with a lookahead so the match does not depend on whether
+# `class` or `href` comes first inside the opening tag.
+_ENTRY_TAG_RE = re.compile(
+    r'<a\b(?=[^>]*\bclass="[^"]*entry-tag[^"]*")'
+    r'[^>]*\bhref="[^"]*/video2/(?P<slug>[^"/]+)/"[^>]*>'
+    r'(?P<name>[^<]+)</a>',
+    re.IGNORECASE,
+)
+# Everything from the opening `.entry-content` <div> up to the closing </article>.
+_ENTRY_CONTENT_RE = re.compile(
+    r'<div[^>]*class="[^"]*entry-content[^"]*"[^>]*>(?P<body>.*?)</article>',
+    re.IGNORECASE | re.DOTALL,
+)
+# Whole <script>/<style> elements, removed before paragraphs are scanned.
+_SCRIPT_STYLE_RE = re.compile(r"<script\b.*?</script>|<style\b.*?</style>", re.IGNORECASE | re.DOTALL)
+# A single <p>…</p> element; `inner` is its raw HTML content.
+_P_RE = re.compile(r"<p\b[^>]*>(?P<inner>.*?)</p>", re.IGNORECASE | re.DOTALL)
+# Any HTML tag (used to strip markup from a fragment).
+_TAG_STRIP_RE = re.compile(r"<[^>]+>")
+# A run of whitespace (collapsed to one space).
+_WS_RE = re.compile(r"\s+")
+# A run of characters that are not allowed in a slug.
+_SLUG_RE = re.compile(r"[^a-z0-9]+")
+
+
+def _slugify(name: str) -> str:
+    """Turn a display name into a lowercase, hyphen-separated ASCII slug.
+
+    Every run of characters outside ``a-z0-9`` becomes one ``-``; leading and
+    trailing hyphens are dropped. Falls back to ``"tag"`` when nothing is left.
+    """
+    hyphenated = _SLUG_RE.sub("-", name.lower())
+    trimmed = hyphenated.strip("-")
+    if not trimmed:
+        return "tag"
+    return trimmed
+
+
+def _clean_text(fragment: str) -> str:
+    """Reduce an HTML fragment to plain, single-spaced text.
+
+    Markup is replaced by spaces before entities are decoded, so an escaped
+    ``&lt;`` in the text is not mistaken for a tag. Whitespace runs are then
+    collapsed and the ends trimmed.
+    """
+    without_markup = _TAG_STRIP_RE.sub(" ", fragment)
+    decoded = html_mod.unescape(without_markup)
+    single_spaced = _WS_RE.sub(" ", decoded)
+    return single_spaced.strip()


 class PornDishScraper(BaseSearchScraper):
@ -17,3 +57,60 @@ class PornDishScraper(BaseSearchScraper):
        r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )
+
+    def _fetch_scene_metadata(
+        self, scene_url: str
+    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None:
+        """Fetch the scene page and extract its tags and prose description.
+
+        Returns a 4-tuple ``(None, [], tags, description)``: the studio is always
+        ``None`` and the performer list is always empty here. ``description`` is
+        ``None`` when no paragraph of at least 40 characters survives filtering.
+
+        Returns ``None`` when the fetch raises, when the response is not a
+        non-empty HTTP 200, or when neither tags nor a description were found.
+        """
+        try:
+            r = browser_get(scene_url, timeout=self._timeout)
+        except Exception as e:
+            # Best-effort: a failed detail fetch only means "no extra metadata".
+            log.debug("porndish meta fetch failed for %s: %s", scene_url, e)
+            return None
+        if r.status_code != 200 or not r.text:
+            return None
+        html = r.text
+
+        # Tags: entry-tag anchors (slug from /video2/<slug>/ plus the display name).
+        tags: list[RawTag] = []
+        seen: set[str] = set()
+        for m in _ENTRY_TAG_RE.finditer(html):
+            name = html_mod.unescape(m.group("name")).strip()
+            slug = (m.group("slug") or "").strip().lower() or _slugify(name)
+            # Skip empty names, over-long names and slugs already collected.
+            if not name or len(name) > 40 or slug in seen:
+                continue
+            seen.add(slug)
+            tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug))
+
+        # Description: the longest prose <p> in .entry-content (whole page as a
+        # fallback), with <script>/<style> elements removed first.
+        mc = _ENTRY_CONTENT_RE.search(html)
+        body = mc.group("body") if mc else html
+        body = _SCRIPT_STYLE_RE.sub(" ", body)
+        # Leading embed-button labels ("Video Player 1 Video Player 2 …",
+        # sometimes "Server N" / "Download") that end up at the start of the prose.
+        label_re = re.compile(r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+", re.IGNORECASE)
+        best = ""
+        for pm in _P_RE.finditer(body):
+            inner = pm.group("inner")
+            if "entry-tag" in inner:
+                continue
+            # Strip the labels per paragraph, BEFORE comparing lengths: otherwise a
+            # paragraph made only of button labels can out-length the real prose,
+            # win the selection and then strip down to nothing.
+            txt = label_re.sub("", _clean_text(inner)).strip()
+            # Skip JS leftovers and the "Watch … porn video" boilerplate line.
+            if not txt or "getElementById" in txt or "addEventListener" in txt:
+                continue
+            low = txt.lower()
+            if low.startswith("watch ") and low.endswith("porn video"):
+                continue
+            if len(txt) > len(best):
+                best = txt
+        # Anything shorter than 40 characters is not treated as a description.
+        description: str | None = best if len(best) >= 40 else None
+
+        if not tags and description is None:
+            return None
+        return (None, [], tags, description)