"""porndish.com — direct HTML scrape.

Search: `https://porndish.com/page/<N>/?s=<query>`.
Scene URL: `https://porndish.com/<slug>/`.

The scene detail page (g1/bimber WordPress theme) contains:

- tag anchors carrying the `entry-tag` class and a `/video2/<slug>/` href,
- a description `<p>` inside `.entry-content` (before `entry-tags`, after the
  embed JS).

Without the `_fetch_scene_metadata` override, a scene coming from porndish
alone had 0 tags and no description (bug report from Jan, 2026-06-06: "no tags
(# on the page) and no description").
"""

from __future__ import annotations

import html as html_mod
import logging
import re

from app.connectors.base import RawPerformer, RawStudio, RawTag
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get

log = logging.getLogger(__name__)

# NOTE(review): the tag literals and named groups in the patterns below were
# reconstructed from how the groups are consumed further down ("slug", "name",
# "body", "inner"). Confirm the closing `</a>` of _ENTRY_TAG_RE and the
# `</div>` terminator of _ENTRY_CONTENT_RE against a live scene page.

# Tag anchor: slug comes from the /video2/<slug>/ href, display name from the text.
_ENTRY_TAG_RE = re.compile(
    r'<a[^>]+href="[^"]*/video2/(?P<slug>[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>'
    r'(?P<name>[^<]+)</a>',
    re.IGNORECASE,
)
# Body of the `.entry-content` container.
_ENTRY_CONTENT_RE = re.compile(
    r'<div[^>]*class="[^"]*entry-content[^"]*"[^>]*>(?P<body>.*?)</div>',
    re.IGNORECASE | re.DOTALL,
)
# Whole <script>/<style> elements, removed before looking for prose.
_SCRIPT_STYLE_RE = re.compile(r"<script.*?</script>|<style.*?</style>", re.IGNORECASE | re.DOTALL)
# One paragraph; "inner" is its raw HTML content.
_P_RE = re.compile(r"<p[^>]*>(?P<inner>.*?)</p>", re.IGNORECASE | re.DOTALL)
_TAG_STRIP_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
_SLUG_RE = re.compile(r"[^a-z0-9]+")
# Leading embed-button labels ("Video Player 1 Video Player 2 ...", sometimes
# "Server N") that end up at the start of the prose.
_LEADING_LABELS_RE = re.compile(
    r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+",
    re.IGNORECASE,
)


def _slugify(name: str) -> str:
    """Return a lowercase, hyphen-separated slug for *name* ("tag" if empty)."""
    return _SLUG_RE.sub("-", name.lower()).strip("-") or "tag"


def _clean_text(fragment: str) -> str:
    """Strip markup from an HTML fragment, unescape entities, collapse whitespace."""
    txt = _TAG_STRIP_RE.sub(" ", fragment)
    txt = html_mod.unescape(txt)
    return _WS_RE.sub(" ", txt).strip()


def _extract_tags(page: str) -> list[RawTag]:
    """Collect de-duplicated tags from the entry-tag anchors found in *page*."""
    tags: list[RawTag] = []
    seen: set[str] = set()
    for m in _ENTRY_TAG_RE.finditer(page):
        name = html_mod.unescape(m.group("name")).strip()
        slug = (m.group("slug") or "").strip().lower() or _slugify(name)
        # Skip empty names, implausibly long names and repeated slugs.
        if not name or len(name) > 40 or slug in seen:
            continue
        seen.add(slug)
        tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug))
    return tags


def _extract_description(page: str) -> str | None:
    """Return the longest prose paragraph of the entry content, or None.

    Falls back to scanning the whole page when no `.entry-content` container is
    matched. Results shorter than 40 characters are discarded.
    """
    mc = _ENTRY_CONTENT_RE.search(page)
    body = mc.group("body") if mc else page
    body = _SCRIPT_STYLE_RE.sub(" ", body)

    best = ""
    for pm in _P_RE.finditer(body):
        inner = pm.group("inner")
        if "entry-tag" in inner:
            continue
        txt = _clean_text(inner)
        # Skip JS leftovers and the "Watch ... porn video" boilerplate.
        if not txt or "getElementById" in txt or "addEventListener" in txt:
            continue
        low = txt.lower()
        if low.startswith("watch ") and low.endswith("porn video"):
            continue
        if len(txt) > len(best):
            best = txt

    best = _LEADING_LABELS_RE.sub("", best).strip()
    return best if len(best) >= 40 else None


class PornDishScraper(BaseSearchScraper):
    """Search scraper for porndish.com that also extracts tags and description."""

    sitetag = "porndishcom"
    _search_url_template = "https://porndish.com/page/{page}/?s={query}"
    # NOTE(review): the group names "url" and "slug" are reconstructed; verify
    # them against what BaseSearchScraper reads from this pattern.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None:
        """Fetch the scene page and return (studio=None, performers=[], tags, description).

        The return value has 4 elements (the base class unpacks the optional
        `description`). porndish does not single out a studio, and the performer
        from the query is added in the base class, so only tags and description
        are produced here.

        Returns None when the fetch fails, the response is not a non-empty
        HTTP 200, or neither tags nor a description could be extracted.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
        except Exception as e:
            # Best effort: a failed metadata fetch must not break the search.
            log.debug("porndish meta fetch failed for %s: %s", scene_url, e)
            return None
        if r.status_code != 200 or not r.text:
            return None

        page = r.text
        tags = _extract_tags(page)
        description = _extract_description(page)

        if not tags and description is None:
            return None
        return (None, [], tags, description)