Search (?s=) started returning 429 and the homepage is JS-rendered (no post links in raw HTML), so the old search scraper got 0 (frozen since 2026-05-07). perverzija is WordPress and the VPS can reach it (200, not CF-blocked), so converted to a browse scraper over the WP REST API (/wp-json/wp/v2/posts?_embed=1): one structured call per page gives title, date, featured thumbnail, studio (category — DadCrush/FamilyStrokes/ … TeamSkeet-family paysite re-ups) and genre tags. Performers via canonical merge (stars taxonomy isn't REST-exposed; title carries names). Playback unchanged (embed iframe → phone-side). 15 fresh + 45 refreshed on first crawl. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
148 lines
5.5 KiB
Python
148 lines
5.5 KiB
Python
"""perverzija.com — latest-posts browse scraper via the WordPress REST API.

History: this used to be a search scraper (`?s=`), but as of 2026-06 perverzija
rate-limits search (429) and the homepage is JS-rendered (no post links in the raw
HTML), so search returned 0 results. The site runs WordPress, so the clean channel is
the REST API: `/wp-json/wp/v2/posts` yields structured JSON (link, date, title,
featured thumbnail, taxonomies) with a single request per page. The VPS can reach it
(curl_cffi bypasses JA3 fingerprinting; 200, not 403). Converted to a browse scraper
on 2026-06-22 (user request).

From REST `?_embed=1` we take: title, date, thumbnail (featured_media), STUDIO
(the `category` taxonomy — e.g. "DadCrush"/"TeamSkeet"; the content is studio re-ups)
and tags (`post_tag`). REST does not expose performers (the custom `stars` taxonomy
has no show_in_rest), so they are left empty and filled in by canonical-merge (studio
content matches TPDB/StashDB well; the title carries the names anyway).

Playback: the post page (tube.perverzija.com/<slug>/) embeds an xtremestream iframe →
extractor `perverzijacom` → `_embed_iframe` → hoster resolved phone-side.
"""
|
|
from __future__ import annotations
|
|
|
|
import html
|
|
import json
|
|
import logging
|
|
from datetime import date, datetime
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
)
|
|
from app.extractors import browser_get
|
|
from app.normalize.text import slugify
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_BASE = "https://www.perverzija.com"
|
|
_PER_PAGE = 20
|
|
|
|
|
|
def _parse_date(value: str | None) -> date | None:
|
|
if not value:
|
|
return None
|
|
try:
|
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
class PerverzijaScraper(BaseBrowseScraper):
    """Browse scraper that lists perverzija.com posts via the WordPress REST API.

    Each listing page is one JSON request (``_embed=1``) from which title,
    date, featured thumbnail, studio (first ``category`` term) and tags
    (``post_tag`` terms) are read. Performers are always emitted empty.
    """

    sitetag = "perverzijacom"

    def _listing_url(self, page: int) -> str:
        """Return the REST posts URL for ``page`` with embedded media and terms."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    # crawl_page is overridden (REST JSON, not HTML), so the HTML-oriented
    # hooks below are unused and only return empty results.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Unused HTML hook; always returns an empty list."""
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Unused HTML hook; always returns ``None``."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts to scenes.

        Args:
            page: Page number passed to the REST ``page`` query parameter.

        Returns:
            A list of scenes for the page; an empty list when the listing is
            exhausted (HTTP 400 past the last page, or an empty / non-list
            payload); ``None`` when the page could not be fetched or decoded
            (transport error, unexpected HTTP status, invalid JSON).
        """
        url = self._listing_url(page)
        try:
            # NOTE(review): self._timeout is presumably set by the base class — confirm.
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:  # boundary: any fetch error means "page failed"
            log.warning("perverzija REST fetch failed (page %d): %s", page, e)
            return None

        status = res.status_code
        # WP answers 400 (rest_post_invalid_page_number) past the last page → exhausted.
        if status == 400:
            return []
        # Any other non-200 (429 rate limit, 403, 5xx, ...) is a failure, not the
        # end of the listing; reporting it as "exhausted" would silently truncate
        # the crawl.
        if status != 200:
            log.warning("perverzija REST: HTTP %s (page %d)", status, page)
            return None

        try:
            posts = json.loads(res.text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            log.warning("perverzija REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        out: list[RawScene] = []
        for post in posts:
            # Skip malformed entries instead of crashing the whole page.
            if not isinstance(post, dict):
                continue
            scene = self._scene_from_post(post)
            if scene is not None:
                out.append(scene)

        log.info("perverzija REST page %d: %d scenes", page, len(out))
        return out

    @staticmethod
    def _clean(value: object) -> str:
        """Return ``value`` stripped when it is a string, otherwise ``""``."""
        return value.strip() if isinstance(value, str) else ""

    @staticmethod
    def _featured_thumb(embedded: dict) -> str | None:
        """Return the first featured-media ``source_url``, or ``None`` if absent."""
        media = embedded.get("wp:featuredmedia")
        if not isinstance(media, list) or not media or not isinstance(media[0], dict):
            return None
        return media[0].get("source_url") or None

    def _studio_and_tags(self, embedded: dict) -> tuple[RawStudio | None, list[RawTag]]:
        """Derive the studio (first named category) and de-duplicated tags."""
        studio: RawStudio | None = None
        tags: list[RawTag] = []
        seen_slugs: set[str] = set()

        groups = embedded.get("wp:term")
        if not isinstance(groups, list):
            return studio, tags

        for group in groups:
            if not isinstance(group, list) or not group or not isinstance(group[0], dict):
                continue
            # The taxonomy is read from the group's first term only.
            taxonomy = group[0].get("taxonomy")
            if taxonomy == "category":
                if studio is None:
                    studio_name = self._clean(group[0].get("name"))
                    if studio_name:
                        studio_slug = slugify(studio_name)
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_name,
                            slug=studio_slug,
                        )
            elif taxonomy == "post_tag":
                for term in group:
                    if not isinstance(term, dict):
                        continue
                    name = self._clean(term.get("name"))
                    if not name:
                        continue
                    # Prefer the slug WordPress supplies; derive one otherwise.
                    slug = self._clean(term.get("slug")) or slugify(name).strip()
                    if slug in seen_slugs:
                        continue
                    seen_slugs.add(slug)
                    tags.append(
                        RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
                    )
        return studio, tags

    def _scene_from_post(self, post: dict) -> RawScene | None:
        """Build a scene from one REST post, or ``None`` if link or title is missing."""
        link = self._clean(post.get("link"))
        title_obj = post.get("title")
        rendered = title_obj.get("rendered") if isinstance(title_obj, dict) else None
        # The rendered title carries HTML entities; a null/non-string title counts as empty.
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        embedded = post.get("_embedded")
        if not isinstance(embedded, dict):
            embedded = {}

        thumb = self._featured_thumb(embedded)
        studio, tags = self._studio_and_tags(embedded)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=_parse_date(post.get("date")),
            url=link,
            studio=studio,
            performers=[],
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )
|