From a10c51aebfe2f7edc934c1ee018efc610ce70e7b Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 24 Jun 2026 15:09:27 +0200 Subject: [PATCH] =?UTF-8?q?feat(ingest):=20revive=20porndish=20=E2=80=94?= =?UTF-8?q?=20search=E2=86=92WP=20REST=20API=20browse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Watchdog flagged porndish as frozen (search ?s= stopped yielding new scenes 2026-05-07, 1151h). It's WordPress and the VPS can reach it, so converted to a browse scraper over the WP REST API (/wp-json/wp/v2/posts?_embed=1), same pattern as perverzija: title, date, featured thumbnail, studio (category — FreeUseFantasy / I Have A Wife / … paysite content) and tags. Performers via canonical merge. Playback unchanged (embed iframe → phone-side). 60 fresh scenes on first crawl. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/connectors/direct_scrapers/__init__.py | 5 +- app/connectors/direct_scrapers/porndish.py | 211 ++++++++++++--------- 2 files changed, 125 insertions(+), 91 deletions(-) diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 12319bf..da28cd8 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -84,7 +84,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL = # streamtape, brak SERVER2/SERVER3 backup. Porn-app sam olewa porn4days. 10,346 # solo-orphan scen. - PornDishScraper, + # PornDishScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-24, + # watchdog GOON-16: search `?s=` zamarzł 2026-05-07). WordPress → browse przez WP REST + # API (/wp-json/wp/v2/posts) jak perverzija: tytuł/data/thumb/studio(category)/tagi. # XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan. # Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek # match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled. @@ -139,6 +141,7 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ PerverzijaScraper, + PornDishScraper, FreshpornoScraper, FpoxxxScraper, # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven diff --git a/app/connectors/direct_scrapers/porndish.py b/app/connectors/direct_scrapers/porndish.py index 6a8ffcb..9b65b0c 100644 --- a/app/connectors/direct_scrapers/porndish.py +++ b/app/connectors/direct_scrapers/porndish.py @@ -1,116 +1,147 @@ -"""porndish.com — direct HTML scrape. +"""porndish.com — latest browse scraper via WordPress REST API. -Search: `https://porndish.com/page//?s=`. -Scene URL: `https://porndish.com//`. +Historia: dawniej search scraper (`?s=`), zamarzł 2026-05-07 (search przestał dawać +nowe sceny — 1151h cisza, watchdog GOON-16). To WordPress (g1/bimber theme), VPS +dociera, więc czysty kanał to REST API: `/wp-json/wp/v2/posts?_embed=1` daje +ustrukturyzowany JSON jednym requestem na stronę. Przerobione na browse 2026-06-24 +(ten sam wzorzec co perverzija). -Scene detail page (g1/bimber WordPress theme) zawiera: - - `

` wewnątrz `.entry-content` (przed `entry-tags`, po embed-JS). -Bez `_fetch_scene_metadata` overrides scena z samego porndish miała 0 tagów i brak -description (bug-report od Jana 2026-06-06: „nie ma tagów (# na stronie) ani description"). +Z REST `_embed`: tytuł, data, miniatura (featured_media), STUDIO (taksonomia +`category` — np. "Freeuse Fantasy", content studyjny) i tagi (`post_tag` — porndish +miesza w nich performerów z gatunkami, bierzemy jak jest; canonical-merge i tak +dorabia performerów z TPDB/StashDB, a tytuł ma nazwiska). Performerów osobno nie +wyciągamy (post_tag ich nie rozdziela od gatunków bez listy known-performers). + +Playback: post page embeduje hoster iframe → extractor `porndishcom` → `_embed_iframe` +→ resolwowany phone-side. """ from __future__ import annotations -import html as html_mod +import html +import json import logging -import re +from datetime import date, datetime -from app.connectors.base import RawPerformer, RawStudio, RawTag -from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.connectors.base import ( + RawFingerprint, + RawPlaybackSource, + RawScene, + RawStudio, + RawTag, +) +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, +) from app.extractors import browser_get +from app.normalize.text import slugify log = logging.getLogger(__name__) -_ENTRY_TAG_RE = re.compile( - r']+href="[^"]*/video2/(?P[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>' - r'(?P[^<]+)', - re.IGNORECASE, -) -_ENTRY_CONTENT_RE = re.compile( - r']*class="[^"]*entry-content[^"]*"[^>]*>(?P.*?)', - re.IGNORECASE | re.DOTALL, -) -_SCRIPT_STYLE_RE = re.compile(r"|", re.IGNORECASE | re.DOTALL) -_P_RE = re.compile(r"]*>(?P.*?)

", re.IGNORECASE | re.DOTALL) -_TAG_STRIP_RE = re.compile(r"<[^>]+>") -_WS_RE = re.compile(r"\s+") -_SLUG_RE = re.compile(r"[^a-z0-9]+") +_BASE = "https://www.porndish.com" +_PER_PAGE = 20 -def _slugify(name: str) -> str: - return _SLUG_RE.sub("-", name.lower()).strip("-") or "tag" +def _parse_date(value: str | None) -> date | None: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).date() + except ValueError: + return None -def _clean_text(fragment: str) -> str: - txt = _TAG_STRIP_RE.sub(" ", fragment) - txt = html_mod.unescape(txt) - return _WS_RE.sub(" ", txt).strip() - - -class PornDishScraper(BaseSearchScraper): +class PornDishScraper(BaseBrowseScraper): sitetag = "porndishcom" - _search_url_template = "https://porndish.com/page/{page}/?s={query}" - _scene_url_re = re.compile( - r'href="(?Phttps://porndish\.com/(?P[a-z0-9][a-z0-9\-]+))/"', - re.IGNORECASE, - ) - def _fetch_scene_metadata( - self, scene_url: str - ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None: - """Fetch scene page → (studio=None, performers=[], tags, description). + def _listing_url(self, page: int) -> str: + return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1" - 4-elementowy zwrot (base unpacka opcjonalny `description`). porndish nie - wyróżnia studia, a performer z query dochodzi w base — tu tylko tagi + opis. - """ + # crawl_page nadpisany (REST JSON, nie HTML) → abstrakcje nieużywane. + def _extract_scene_urls(self, listing_html: str) -> list[str]: + return [] + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + return None + + def crawl_page(self, page: int) -> list[RawScene] | None: + url = self._listing_url(page) try: - r = browser_get(scene_url, timeout=self._timeout) + res = browser_get(url, timeout=self._timeout) except Exception as e: - log.debug("porndish meta fetch failed for %s: %s", scene_url, e) + log.warning("porndish REST fetch failed (page %d): %s", page, e) return None - if r.status_code != 200 or not r.text: + # WP zwraca 400 (rest_post_invalid_page_number) za ostatnią stroną → exhausted. + if res.status_code != 200: + return [] + try: + posts = json.loads(res.text) + except (json.JSONDecodeError, ValueError): + log.warning("porndish REST: bad JSON page %d", page) return None - html = r.text + if not isinstance(posts, list) or not posts: + return [] - # Tagi: entry-tag anchors (slug z /video2// + display name). - tags: list[RawTag] = [] - seen: set[str] = set() - for m in _ENTRY_TAG_RE.finditer(html): - name = html_mod.unescape(m.group("name")).strip() - slug = (m.group("slug") or "").strip().lower() or _slugify(name) - if not name or len(name) > 40 or slug in seen: + out: list[RawScene] = [] + for p in posts: + link = (p.get("link") or "").strip() + title = html.unescape((p.get("title") or {}).get("rendered", "")).strip() + if not link or not title: continue - seen.add(slug) - tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug)) + release_date = _parse_date(p.get("date")) - # Description: najdłuższy prozowy

w .entry-content (bez entry-tags / embed-JS). - description: str | None = None - mc = _ENTRY_CONTENT_RE.search(html) - body = mc.group("body") if mc else html - body = _SCRIPT_STYLE_RE.sub(" ", body) - best = "" - for pm in _P_RE.finditer(body): - inner = pm.group("inner") - if "entry-tag" in inner: - continue - txt = _clean_text(inner) - # Pomijamy resztki JS / boilerplate „Watch … porn video" / przyciski serwerów. - if not txt or "getElementById" in txt or "addEventListener" in txt: - continue - low = txt.lower() - if low.startswith("watch ") and low.endswith("porn video"): - continue - if len(txt) > len(best): - best = txt - # Strip wiodące etykiety przycisków embedu („Video Player 1 Video Player 2 …", - # czasem „Server N") które wpadają na początek prozy. - best = re.sub(r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+", "", best, flags=re.IGNORECASE).strip() - if len(best) >= 40: - description = best + emb = p.get("_embedded") or {} + fm = emb.get("wp:featuredmedia") or [] + thumb = (fm[0].get("source_url") if fm and isinstance(fm[0], dict) else None) or None - if not tags and description is None: - return None - return (None, [], tags, description) + studio: RawStudio | None = None + tags: list[RawTag] = [] + seen_tag: set[str] = set() + for group in emb.get("wp:term") or []: + if not group: + continue + tax = group[0].get("taxonomy") + if tax == "category" and studio is None: + sname = (group[0].get("name") or "").strip() + if sname: + studio = RawStudio( + external_id=f"{self.sitetag}:studio:{slugify(sname)}", + name=sname, slug=slugify(sname), + ) + elif tax == "post_tag": + for g in group: + name = (g.get("name") or "").strip() + sl = (g.get("slug") or slugify(name)).strip() + if not name or sl in seen_tag: + continue + seen_tag.add(sl) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl)) + + fingerprints: list[RawFingerprint] = [] + if thumb: + ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") + if ph: + fingerprints.append(RawFingerprint(kind="phash", value=ph)) + + out.append( + RawScene( + external_id=f"{self.sitetag}:{link}", + title=title, + release_date=release_date, + url=link, + studio=studio, + performers=[], + tags=tags, + fingerprints=fingerprints, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=link, + thumbnail_url=thumb, + ) + ], + ) + ) + + log.info("porndish REST page %d: %d scenes", page, len(out)) + return out