porndish-only scenes had no tags and no description — the scraper only derived a
title from the URL slug. The scene page (g1/bimber WP theme) carries both: a
<p class="entry-tags"> list of /video2/<slug>/ links (the "#" tags the user sees,
categories + co-performers) and a prose description <p> in .entry-content.
Override _fetch_scene_metadata in PornDishScraper to pull both from one page
fetch. Extend the base hook to accept an optional 4th return element
(description) and thread it into RawScene.description — backward compatible with
the existing 3-tuple (pornhat). Strips leading embed-button labels
("Video Player N", "Server N") from the prose. Verified on live scenes: clean
tag lists + real descriptions.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
116 lines
4.6 KiB
Python
116 lines
4.6 KiB
Python
"""porndish.com — direct HTML scrape.
|
|
|
|
Search: `https://porndish.com/page/<n>/?s=<q>`.
|
|
Scene URL: `https://porndish.com/<slug>/`.
|
|
|
|
Scene detail page (g1/bimber WordPress theme) contains:
- `<p class="entry-tags"><a class="entry-tag entry-tag-N" href=".../video2/<slug>/">Name</a>…`
  — the tag list (categories and performers mixed together, the way porndish shows
  them as "#" hashtags). All of them are taken as RawTag (the resolver dedupes; the
  performer from the query is added separately anyway).
- the description prose in a `<p>` inside `.entry-content` (before `entry-tags`, after the embed JS).
Without the `_fetch_scene_metadata` override, a scene found only on porndish had 0 tags and no
description (bug report from Jan, 2026-06-06: "no tags (# on the page) and no description").
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import html as html_mod
|
|
import logging
|
|
import re
|
|
|
|
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
from app.extractors import browser_get
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_ENTRY_TAG_RE = re.compile(
|
|
r'<a[^>]+href="[^"]*/video2/(?P<slug>[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>'
|
|
r'(?P<name>[^<]+)</a>',
|
|
re.IGNORECASE,
|
|
)
|
|
_ENTRY_CONTENT_RE = re.compile(
|
|
r'<div[^>]*class="[^"]*entry-content[^"]*"[^>]*>(?P<body>.*?)</article>',
|
|
re.IGNORECASE | re.DOTALL,
|
|
)
|
|
_SCRIPT_STYLE_RE = re.compile(r"<script\b.*?</script>|<style\b.*?</style>", re.IGNORECASE | re.DOTALL)
|
|
_P_RE = re.compile(r"<p\b[^>]*>(?P<inner>.*?)</p>", re.IGNORECASE | re.DOTALL)
|
|
_TAG_STRIP_RE = re.compile(r"<[^>]+>")
|
|
_WS_RE = re.compile(r"\s+")
|
|
_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
|
|
|
|
|
def _slugify(name: str) -> str:
    """Return a lowercase, hyphen-separated slug for *name*, or "tag" if nothing is left."""
    # Every run of characters outside [a-z0-9] becomes a single hyphen.
    hyphenated = _SLUG_RE.sub("-", name.lower())
    trimmed = hyphenated.strip("-")
    if trimmed:
        return trimmed
    return "tag"
|
|
|
|
|
|
def _clean_text(fragment: str) -> str:
    """Reduce an HTML fragment to plain text with single spaces between words."""
    # Tags are replaced by a space (not removed) so neighbouring elements
    # do not get glued into one word.
    without_markup = _TAG_STRIP_RE.sub(" ", fragment)
    # Entities are decoded only after the markup is gone.
    decoded = html_mod.unescape(without_markup)
    collapsed = _WS_RE.sub(" ", decoded)
    return collapsed.strip()
|
|
|
|
|
|
class PornDishScraper(BaseSearchScraper):
    """Search scraper for porndish.com that also reads tags and a description from the scene page."""

    sitetag = "porndishcom"
    _search_url_template = "https://porndish.com/page/{page}/?s={query}"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )
    # Leading embed-button labels ("Video Player 1 Video Player 2 …", sometimes
    # "Server N" or "Download") that end up at the start of a paragraph's text.
    _embed_label_re = re.compile(
        r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+",
        re.IGNORECASE,
    )

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None:
        """Fetch the scene page and return ``(None, [], tags, description)``.

        The studio is always ``None`` and the performer list is always empty:
        this method only extracts tags and a description. The 4th element
        (description) is an extension of the 3-tuple form of this hook.

        Args:
            scene_url: URL of the scene detail page.

        Returns:
            The 4-tuple described above, or ``None`` when the fetch fails, the
            response is not a non-empty HTTP 200, or neither tags nor a
            description could be extracted.
        """
        # Best-effort: any fetch error just means "no metadata" for this scene.
        try:
            r = browser_get(scene_url, timeout=self._timeout)
        except Exception as e:
            log.debug("porndish meta fetch failed for %s: %s", scene_url, e)
            return None
        if r.status_code != 200 or not r.text:
            return None
        html = r.text

        # Tags: entry-tag anchors (slug from /video2/<slug>/ plus display name).
        tags: list[RawTag] = []
        seen: set[str] = set()
        for m in _ENTRY_TAG_RE.finditer(html):
            name = html_mod.unescape(m.group("name")).strip()
            slug = (m.group("slug") or "").strip().lower() or _slugify(name)
            # Skip empty or overly long names (> 40 chars) and repeated slugs.
            if not name or len(name) > 40 or slug in seen:
                continue
            seen.add(slug)
            tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug))

        # Description: the longest prose <p> in .entry-content (falling back to
        # the whole page when that container is not found), scripts/styles removed.
        mc = _ENTRY_CONTENT_RE.search(html)
        body = mc.group("body") if mc else html
        body = _SCRIPT_STYLE_RE.sub(" ", body)
        best = ""
        for pm in _P_RE.finditer(body):
            inner = pm.group("inner")
            if "entry-tag" in inner:
                # This is the tag list paragraph, not prose.
                continue
            txt = _clean_text(inner)
            # Skip leftover JS.
            if not txt or "getElementById" in txt or "addEventListener" in txt:
                continue
            # Strip embed-button labels BEFORE the boilerplate and length checks,
            # so a label-heavy paragraph cannot outrank the real prose and the
            # "Watch … porn video" filter still sees the actual start of the text.
            txt = self._embed_label_re.sub("", txt).strip()
            if not txt:
                continue
            low = txt.lower()
            if low.startswith("watch ") and low.endswith("porn video"):
                continue
            if len(txt) > len(best):
                best = txt

        # Anything shorter than 40 characters is not treated as a description.
        description: str | None = best if len(best) >= 40 else None

        if not tags and description is None:
            return None
        return (None, [], tags, description)
|