"""perverzija.com — latest browse scraper via WordPress REST API. Historia: dawniej search scraper (`?s=`), ale 2026-06 perverzija rate-limituje search (429) a homepage jest JS-renderowane (brak linków postów w surowym HTML) → search zwracał 0. To WordPress, więc czysty kanał to REST API: `/wp-json/wp/v2/posts` daje ustrukturyzowany JSON (link, date, title, featured thumb, taksonomie) jednym requestem na stronę. VPS dociera (curl_cffi bypassuje JA3; 200 nie 403). Przerobione na browse 2026-06-22 (user request). Z REST `?_embed=1` bierzemy: tytuł, datę, miniaturę (featured_media), STUDIO (taksonomia `category` — np. "DadCrush"/"TeamSkeet", to studyjny re-up) i tagi (`post_tag`). Performerów REST nie wystawia (custom taksonomia `stars` bez show_in_rest) → puste, dorabia canonical-merge (content studyjny dobrze matchuje TPDB/StashDB; tytuł i tak ma nazwiska). Playback: post page (tube.perverzija.com//) embeduje xtremestream iframe → extractor `perverzijacom` → `_embed_iframe` → hoster resolwowany phone-side. """ from __future__ import annotations import html import json import logging from datetime import date, datetime from app.connectors.base import ( RawFingerprint, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, ) from app.extractors import browser_get from app.normalize.text import slugify log = logging.getLogger(__name__) _BASE = "https://www.perverzija.com" _PER_PAGE = 20 def _parse_date(value: str | None) -> date | None: if not value: return None try: return datetime.fromisoformat(value.replace("Z", "+00:00")).date() except ValueError: return None class PerverzijaScraper(BaseBrowseScraper): sitetag = "perverzijacom" def _listing_url(self, page: int) -> str: return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1" # crawl_page nadpisany (REST JSON, nie HTML) → abstrakcje nieużywane. def _extract_scene_urls(self, listing_html: str) -> list[str]: return [] def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: return None def crawl_page(self, page: int) -> list[RawScene] | None: url = self._listing_url(page) try: res = browser_get(url, timeout=self._timeout) except Exception as e: log.warning("perverzija REST fetch failed (page %d): %s", page, e) return None # WP zwraca 400 (rest_post_invalid_page_number) za ostatnią stroną → exhausted. if res.status_code != 200: return [] try: posts = json.loads(res.text) except (json.JSONDecodeError, ValueError): log.warning("perverzija REST: bad JSON page %d", page) return None if not isinstance(posts, list) or not posts: return [] out: list[RawScene] = [] for p in posts: link = (p.get("link") or "").strip() title = html.unescape((p.get("title") or {}).get("rendered", "")).strip() if not link or not title: continue release_date = _parse_date(p.get("date")) emb = p.get("_embedded") or {} fm = emb.get("wp:featuredmedia") or [] thumb = (fm[0].get("source_url") if fm and isinstance(fm[0], dict) else None) or None studio: RawStudio | None = None tags: list[RawTag] = [] seen_tag: set[str] = set() for group in emb.get("wp:term") or []: if not group: continue tax = group[0].get("taxonomy") if tax == "category" and studio is None: sname = (group[0].get("name") or "").strip() if sname: studio = RawStudio( external_id=f"{self.sitetag}:studio:{slugify(sname)}", name=sname, slug=slugify(sname), ) elif tax == "post_tag": for g in group: name = (g.get("name") or "").strip() sl = (g.get("slug") or slugify(name)).strip() if not name or sl in seen_tag: continue seen_tag.add(sl) tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl)) fingerprints: list[RawFingerprint] = [] if thumb: ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) out.append( RawScene( external_id=f"{self.sitetag}:{link}", title=title, release_date=release_date, url=link, studio=studio, performers=[], tags=tags, fingerprints=fingerprints, playback_sources=[ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=link, thumbnail_url=thumb, ) ], ) ) log.info("perverzija REST page %d: %d scenes", page, len(out)) return out