"""mypornerleak.com — latest BROWSE scraper via the WordPress REST API.

This scraper lives alongside the search scraper: MyPornerLeakScraper (search)
stays in ALL_DIRECT_SCRAPERS, while this browse scraper adds freshness
straight from the WP REST API (`/wp-json/wp/v2/posts?_embed=1`).

Unlike perverzija/porndish, mypornerleak DOES expose the custom `actors`
taxonomy over REST, so we also get performers (not only the studio from
`category` plus the tags from `post_tag`).

Playback: the post page embeds a hoster iframe, handled by the
`mypornerleakcom` extractor via `_embed_iframe` and resolved phone-side
(unchanged).
"""
from __future__ import annotations

import html
import json
import logging
from datetime import date, datetime
from typing import Any

from app.connectors.base import (
    RawFingerprint,
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify

log = logging.getLogger(__name__)

_BASE = "https://mypornerleak.com"
_PER_PAGE = 20


def _text(value: object) -> str:
    """Return *value* stripped when it is a string, otherwise ``""``.

    JSON fields that should be strings may arrive as ``None`` or as another
    type; treating those as empty keeps one bad field from raising.
    """
    return value.strip() if isinstance(value, str) else ""


def _parse_date(value: str | None) -> date | None:
    """Return the calendar date of an ISO-8601 timestamp, or ``None``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Empty,
    non-string or unparseable values yield ``None`` instead of raising.
    """
    if not value or not isinstance(value, str):
        return None
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
    except ValueError:
        return None


class MyPornerLeakBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes from the WP REST posts listing."""

    sitetag = "mypornerleakcom"

    def _listing_url(self, page: int) -> str:
        """Return the REST posts URL for *page* with embedded media/terms."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return an empty list; ``crawl_page`` here never calls this hook.

        NOTE(review): presumably a stub for a hook required by
        ``BaseBrowseScraper`` — confirm against the base class.
        """
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Return ``None``; scenes are built from the REST JSON, not detail pages.

        NOTE(review): presumably a stub for a hook required by
        ``BaseBrowseScraper`` — confirm against the base class.
        """
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts to scenes.

        Returns:
            ``None`` when the request raises or the body is not valid JSON;
            ``[]`` when the status is not 200 or the payload is not a
            non-empty list; otherwise the scenes parsed from the page.
            Malformed individual posts are skipped rather than failing the
            whole page.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("mypornerleak REST fetch failed (page %d): %s", page, e)
            return None
        if res.status_code != 200:
            return []
        try:
            posts = json.loads(res.text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            log.warning("mypornerleak REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        out: list[RawScene] = []
        for post in posts:
            # A non-object entry cannot be a post; skip it instead of
            # letting an AttributeError discard the rest of the page.
            if not isinstance(post, dict):
                continue
            scene = self._scene_from_post(post)
            if scene is not None:
                out.append(scene)
        log.info("mypornerleak REST page %d: %d scenes", page, len(out))
        return out

    def _scene_from_post(self, post: dict[str, Any]) -> RawScene | None:
        """Build a ``RawScene`` from one REST post, or ``None`` if unusable.

        A post is unusable when it has no link or no (rendered) title.
        """
        link = _text(post.get("link"))
        title_field = post.get("title")
        rendered = title_field.get("rendered") if isinstance(title_field, dict) else None
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        embedded = post.get("_embedded")
        if not isinstance(embedded, dict):
            embedded = {}

        # Only the first featured-media entry is used, and only when it is an
        # object; anything else (missing, empty, unexpected shape) means no
        # thumbnail.
        media = embedded.get("wp:featuredmedia")
        first_media = media[0] if isinstance(media, list) and media else None
        thumb = (first_media.get("source_url") if isinstance(first_media, dict) else None) or None

        studio, performers, tags = self._terms_from_embedded(embedded)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=_parse_date(post.get("date")),
            url=link,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )

    def _terms_from_embedded(
        self, embedded: dict[str, Any]
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]]:
        """Extract studio, performers and tags from embedded ``wp:term`` groups.

        Each group is expected to be a list of term objects sharing one
        taxonomy, identified from the group's first term:

        * ``category`` — the first term of the first such group is the studio;
        * ``actors``   — every term is a performer, de-duplicated by slugified name;
        * ``post_tag`` — every term is a tag, de-duplicated by slug.

        Groups or terms of an unexpected shape are ignored.
        """
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf: set[str] = set()
        seen_tag: set[str] = set()

        groups = embedded.get("wp:term")
        if not isinstance(groups, list):
            return studio, performers, tags

        for group in groups:
            # A group that is not a list (e.g. an embedded error object)
            # carries no terms; indexing it with [0] would raise.
            if not isinstance(group, list):
                continue
            terms = [t for t in group if isinstance(t, dict)]
            if not terms:
                continue
            taxonomy = terms[0].get("taxonomy")

            if taxonomy == "category":
                if studio is None:
                    sname = _text(terms[0].get("name"))
                    if sname:
                        studio_slug = slugify(sname)
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=sname,
                            slug=studio_slug,
                        )
            elif taxonomy == "actors":
                for term in terms:
                    name = _text(term.get("name"))
                    if not name:
                        continue
                    sl = slugify(name)
                    if sl in seen_perf:
                        continue
                    seen_perf.add(sl)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
                    )
            elif taxonomy == "post_tag":
                for term in terms:
                    name = _text(term.get("name"))
                    if not name:
                        continue
                    # Prefer the slug supplied by the API; fall back to one
                    # derived from the name.
                    sl = _text(term.get("slug")) or slugify(name).strip()
                    if sl in seen_tag:
                        continue
                    seen_tag.add(sl)
                    tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))

        return studio, performers, tags