feat(ingest): revive porndish — search→WP REST API browse
Watchdog flagged porndish as frozen (search ?s= stopped yielding new scenes 2026-05-07, 1151h). It's WordPress and the VPS can reach it, so converted to a browse scraper over the WP REST API (/wp-json/wp/v2/posts?_embed=1), same pattern as perverzija: title, date, featured thumbnail, studio (category — FreeUseFantasy / I Have A Wife / … paysite content) and tags. Performers via canonical merge. Playback unchanged (embed iframe → phone-side). 60 fresh scenes on first crawl. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b3ecf7141a
commit
a10c51aebf
2 changed files with 125 additions and 91 deletions
|
|
@ -84,7 +84,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
|||
# only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL =
|
||||
# streamtape, brak SERVER2/SERVER3 backup. Porn-app sam olewa porn4days. 10,346
|
||||
# solo-orphan scen.
|
||||
PornDishScraper,
|
||||
# PornDishScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-24,
|
||||
# watchdog GOON-16: search `?s=` zamarzł 2026-05-07). WordPress → browse przez WP REST
|
||||
# API (/wp-json/wp/v2/posts) jak perverzija: tytuł/data/thumb/studio(category)/tagi.
|
||||
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
|
||||
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
|
||||
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
|
||||
|
|
@ -139,6 +141,7 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
|||
|
||||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||
PerverzijaScraper,
|
||||
PornDishScraper,
|
||||
FreshpornoScraper,
|
||||
FpoxxxScraper,
|
||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||
|
|
|
|||
|
|
@ -1,116 +1,147 @@
|
|||
"""porndish.com — direct HTML scrape.
|
||||
"""porndish.com — latest browse scraper via WordPress REST API.
|
||||
|
||||
Search: `https://porndish.com/page/<n>/?s=<q>`.
|
||||
Scene URL: `https://porndish.com/<slug>/`.
|
||||
Historia: dawniej search scraper (`?s=`), zamarzł 2026-05-07 (search przestał dawać
|
||||
nowe sceny — 1151h cisza, watchdog GOON-16). To WordPress (g1/bimber theme), VPS
|
||||
dociera, więc czysty kanał to REST API: `/wp-json/wp/v2/posts?_embed=1` daje
|
||||
ustrukturyzowany JSON jednym requestem na stronę. Przerobione na browse 2026-06-24
|
||||
(ten sam wzorzec co perverzija).
|
||||
|
||||
Scene detail page (g1/bimber WordPress theme) zawiera:
|
||||
- `<p class="entry-tags"><a class="entry-tag entry-tag-N" href=".../video2/<slug>/">Name</a>…`
|
||||
— lista tagów (kategorie + performerzy wymieszani, tak jak porndish je pokazuje
|
||||
jako „#" hashtagi). Bierzemy wszystkie jako RawTag (resolver dedupuje; performer
|
||||
z query i tak dochodzi osobno).
|
||||
- prozę opisu w `<p>` wewnątrz `.entry-content` (przed `entry-tags`, po embed-JS).
|
||||
Bez `_fetch_scene_metadata` overrides scena z samego porndish miała 0 tagów i brak
|
||||
description (bug-report od Jana 2026-06-06: „nie ma tagów (# na stronie) ani description").
|
||||
Z REST `_embed`: tytuł, data, miniatura (featured_media), STUDIO (taksonomia
|
||||
`category` — np. "Freeuse Fantasy", content studyjny) i tagi (`post_tag` — porndish
|
||||
miesza w nich performerów z gatunkami, bierzemy jak jest; canonical-merge i tak
|
||||
dorabia performerów z TPDB/StashDB, a tytuł ma nazwiska). Performerów osobno nie
|
||||
wyciągamy (post_tag ich nie rozdziela od gatunków bez listy known-performers).
|
||||
|
||||
Playback: post page embeduje hoster iframe → extractor `porndishcom` → `_embed_iframe`
|
||||
→ resolwowany phone-side.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html as html_mod
|
||||
import html
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import date, datetime
|
||||
|
||||
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||
from app.connectors.base import (
|
||||
RawFingerprint,
|
||||
RawPlaybackSource,
|
||||
RawScene,
|
||||
RawStudio,
|
||||
RawTag,
|
||||
)
|
||||
from app.connectors.direct_scrapers._browse_base import (
|
||||
BaseBrowseScraper,
|
||||
compute_thumbnail_phash,
|
||||
)
|
||||
from app.extractors import browser_get
|
||||
from app.normalize.text import slugify
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_ENTRY_TAG_RE = re.compile(
|
||||
r'<a[^>]+href="[^"]*/video2/(?P<slug>[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>'
|
||||
r'(?P<name>[^<]+)</a>',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_ENTRY_CONTENT_RE = re.compile(
|
||||
r'<div[^>]*class="[^"]*entry-content[^"]*"[^>]*>(?P<body>.*?)</article>',
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
_SCRIPT_STYLE_RE = re.compile(r"<script\b.*?</script>|<style\b.*?</style>", re.IGNORECASE | re.DOTALL)
|
||||
_P_RE = re.compile(r"<p\b[^>]*>(?P<inner>.*?)</p>", re.IGNORECASE | re.DOTALL)
|
||||
_TAG_STRIP_RE = re.compile(r"<[^>]+>")
|
||||
_WS_RE = re.compile(r"\s+")
|
||||
_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
||||
_BASE = "https://www.porndish.com"
|
||||
_PER_PAGE = 20
|
||||
|
||||
|
||||
def _slugify(name: str) -> str:
    """Build a lowercase, hyphen-separated slug from a display name.

    Every run of characters matched by ``_SLUG_RE`` is collapsed into a single
    hyphen and hyphens at either end are trimmed. When nothing is left, the
    literal ``"tag"`` is returned so the caller always gets a non-empty slug.
    """
    hyphenated = _SLUG_RE.sub("-", name.lower())
    trimmed = hyphenated.strip("-")
    if trimmed:
        return trimmed
    return "tag"
|
||||
def _parse_date(value: str | None) -> date | None:
    """Parse an ISO-8601 timestamp string into a calendar date.

    Args:
        value: Timestamp text such as ``"2026-06-24T10:15:00"``; a trailing
            ``Z`` (UTC designator) is accepted. Anything that is not a
            non-empty string yields ``None``.

    Returns:
        The date part of the timestamp, or ``None`` when the value is missing,
        not a string, or not parseable.
    """
    # The value comes straight from decoded JSON, so it may be any JSON type;
    # a non-string must not raise and abort the whole page.
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None
    # datetime.fromisoformat() only understands a "Z" suffix from Python 3.11 on,
    # so rewrite it as an explicit UTC offset.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        return None
|
||||
|
||||
|
||||
def _clean_text(fragment: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Markup matched by ``_TAG_STRIP_RE`` is replaced with a space, HTML entities
    are decoded, and every whitespace run is squeezed into one space before the
    result is trimmed.
    """
    without_markup = _TAG_STRIP_RE.sub(" ", fragment)
    decoded = html_mod.unescape(without_markup)
    squeezed = _WS_RE.sub(" ", decoded)
    return squeezed.strip()
|
||||
|
||||
|
||||
class PornDishScraper(BaseBrowseScraper):
    """Browse scraper for porndish backed by the WordPress REST posts endpoint.

    One JSON request per listing page (``/wp-json/wp/v2/posts`` with
    ``_embed=1``) supplies title, date, featured thumbnail, studio (first
    ``category`` term) and tags (``post_tag`` terms). Performers are not
    extracted here; ``performers`` is always an empty list.
    """

    sitetag = "porndishcom"

    def _listing_url(self, page: int) -> str:
        """Return the REST listing URL for the given page number."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    # crawl_page is overridden (REST JSON, not HTML), so the HTML-oriented
    # hooks below are unused stubs.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Unused stub: listings are read as JSON in ``crawl_page``."""
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Unused stub: scenes are built from REST posts in ``crawl_page``."""
        return None

    @staticmethod
    def _text(value: object) -> str:
        """Return ``value`` stripped if it is a string, otherwise ``""``."""
        return value.strip() if isinstance(value, str) else ""

    def _terms_from_embed(self, embedded: dict) -> tuple[RawStudio | None, list[RawTag]]:
        """Extract the studio and the de-duplicated tags from ``wp:term``.

        The first term of the first ``category`` group becomes the studio;
        every term of each ``post_tag`` group becomes a tag, keyed by slug.
        """
        studio: RawStudio | None = None
        tags: list[RawTag] = []
        seen_slugs: set[str] = set()
        groups = embedded.get("wp:term")
        if not isinstance(groups, list):
            return studio, tags
        for group in groups:
            # Each group is expected to be a list of term objects; anything
            # else (e.g. an embedded error object) is skipped rather than
            # crashing the page.
            if not isinstance(group, list) or not group or not isinstance(group[0], dict):
                continue
            taxonomy = group[0].get("taxonomy")
            if taxonomy == "category":
                if studio is None:
                    studio_name = self._text(group[0].get("name"))
                    if studio_name:
                        studio_slug = slugify(studio_name)
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_name,
                            slug=studio_slug,
                        )
            elif taxonomy == "post_tag":
                for term in group:
                    if not isinstance(term, dict):
                        continue
                    name = self._text(term.get("name"))
                    if not name:
                        continue
                    slug = self._text(term.get("slug")) or slugify(name)
                    if slug in seen_slugs:
                        continue
                    seen_slugs.add(slug)
                    tags.append(
                        RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
                    )
        return studio, tags

    def _scene_from_post(self, post: dict) -> RawScene | None:
        """Build a ``RawScene`` from one REST post, or ``None`` if unusable.

        A post without a link or without a title is dropped.
        """
        link = self._text(post.get("link"))
        title_field = post.get("title")
        rendered = title_field.get("rendered") if isinstance(title_field, dict) else None
        # ``rendered`` is unescaped because it carries HTML entities; a null or
        # non-string value is treated as an empty title instead of raising.
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        embedded = post.get("_embedded")
        if not isinstance(embedded, dict):
            embedded = {}

        media = embedded.get("wp:featuredmedia")
        first_media = media[0] if isinstance(media, list) and media else None
        thumb: str | None = None
        if isinstance(first_media, dict):
            thumb = self._text(first_media.get("source_url")) or None

        studio, tags = self._terms_from_embed(embedded)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            phash = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if phash:
                fingerprints.append(RawFingerprint(kind="phash", value=phash))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=_parse_date(post.get("date")),
            url=link,
            studio=studio,
            performers=[],
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts to scenes.

        Returns:
            ``None`` when the request raises or the body is not valid JSON;
            an empty list when the status is not 200 or the payload is not a
            non-empty JSON array; otherwise the scenes built from the posts.
            NOTE(review): the base class presumably treats ``[]`` as
            "exhausted" and ``None`` as a failed page; confirm against
            ``BaseBrowseScraper``.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:  # fetch boundary: any transport error fails this page
            log.warning("porndish REST fetch failed (page %d): %s", page, e)
            return None

        if res.status_code != 200:
            # WP answers 400 (rest_post_invalid_page_number) past the last
            # page, which is the normal end-of-listing signal. Any other status
            # is logged so that blocks or outages are not mistaken for it.
            if res.status_code != 400:
                log.warning("porndish REST: HTTP %s on page %d", res.status_code, page)
            return []

        try:
            posts = json.loads(res.text)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers a null body.
            log.warning("porndish REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        out: list[RawScene] = []
        for post in posts:
            if not isinstance(post, dict):
                continue
            scene = self._scene_from_post(post)
            if scene is not None:
                out.append(scene)

        log.info("porndish REST page %d: %d scenes", page, len(out))
        return out
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue