goon/app/connectors/direct_scrapers/mypornerleak_browse.py
jtrzupek 55612e262b feat(ingest): add browse scrapers for porntrex + mypornerleak (alongside search)
Both were search-only — fresh only as long as the performer queue cycles and the
site search keeps working. Added browse scrapers next to the existing search ones
(xvideos/eporner pattern: search keeps performer back-catalog coverage, browse
guarantees latest-feed freshness → watchdog 48h instead of 168h):
- porntrex: KVS /latest-updates/<n>/ (title + thumb + phash)
- mypornerleak: WP REST /wp-json/wp/v2/posts?_embed=1 (title + date + studio from
  category + performers from the actors taxonomy)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 15:41:22 +02:00

150 lines
5.4 KiB
Python

"""mypornerleak.com — latest BROWSE scraper via WordPress REST API, obok search scrapera.
MyPornerLeakScraper (search) stays in ALL_DIRECT_SCRAPERS; this browse scraper adds
freshness straight from WP REST (`/wp-json/wp/v2/posts?_embed=1`). Unlike
perverzija/porndish, mypornerleak EXPOSES the custom `actors` taxonomy over REST →
we also get performers (not only the studio from `category` + tags from `post_tag`).
Playback: the post page embeds a hoster iframe → extractor `mypornerleakcom` →
`_embed_iframe`, resolved phone-side (unchanged).
"""
from __future__ import annotations
import html
import json
import logging
from datetime import date, datetime
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Site root; used to build the REST listing URL and the thumbnail referer.
_BASE = "https://mypornerleak.com"
# Value sent as the `per_page` query parameter of the REST listing request.
_PER_PAGE = 20
def _parse_date(value: str | None) -> date | None:
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
except ValueError:
return None
class MyPornerLeakBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes from the WordPress REST posts feed.

    The listing endpoint returns JSON with embedded media and taxonomy terms
    (``_embed=1``), so :meth:`crawl_page` creates every ``RawScene`` straight
    from the listing response. The HTML-oriented hooks
    (:meth:`_extract_scene_urls`, :meth:`_parse_detail`) are therefore inert.
    """

    sitetag = "mypornerleakcom"

    def _listing_url(self, page: int) -> str:
        """Return the REST posts URL for listing page ``page``."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return no URLs; the JSON listing is consumed by ``crawl_page``.

        NOTE(review): presumably a hook required by ``BaseBrowseScraper`` —
        confirm against the base class.
        """
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Return ``None``; this scraper does not parse detail pages."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts into scenes.

        Args:
            page: Listing page number passed to the REST ``page`` parameter.

        Returns:
            ``None`` when the request raised or the body was not valid JSON;
            an empty list when the response status was not 200 or the payload
            was not a non-empty JSON array; otherwise the scenes built from
            the posts (posts without a link or title are skipped).
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:  # fetch boundary: any failure means "no result"
            log.warning("mypornerleak REST fetch failed (page %d): %s", page, e)
            return None
        if res.status_code != 200:
            # NOTE(review): every non-200 is reported as an empty page, so a
            # block/5xx is indistinguishable from "past the last page" for
            # the caller — confirm that is the intended contract.
            log.info("mypornerleak REST page %d: HTTP %s", page, res.status_code)
            return []
        try:
            posts = json.loads(res.text)
        except (ValueError, TypeError):
            # JSONDecodeError is a ValueError; TypeError covers a non-text body.
            log.warning("mypornerleak REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        scenes: list[RawScene] = []
        for post in posts:
            scene = self._scene_from_post(post)
            if scene is not None:
                scenes.append(scene)
        log.info("mypornerleak REST page %d: %d scenes", page, len(scenes))
        return scenes

    def _scene_from_post(self, post: object) -> RawScene | None:
        """Build a ``RawScene`` from one REST post object, or ``None`` to skip it."""
        # A malformed entry must not abort the rest of the page.
        if not isinstance(post, dict):
            return None

        raw_link = post.get("link")
        link = raw_link.strip() if isinstance(raw_link, str) else ""

        title_obj = post.get("title")
        rendered = title_obj.get("rendered") if isinstance(title_obj, dict) else None
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        embedded = post.get("_embedded")
        if not isinstance(embedded, dict):
            embedded = {}

        thumb = self._thumbnail_url(embedded.get("wp:featuredmedia"))
        studio, performers, tags = self._collect_terms(embedded.get("wp:term"))

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=_parse_date(post.get("date")),
            url=link,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )

    @staticmethod
    def _thumbnail_url(media: object) -> str | None:
        """Return ``source_url`` of the first embedded media item, if usable."""
        if not isinstance(media, list) or not media:
            return None
        first = media[0]
        if not isinstance(first, dict):
            return None
        src = first.get("source_url")
        return src if isinstance(src, str) and src else None

    @staticmethod
    def _term_name(term: object) -> str:
        """Return a term's ``name``, HTML-unescaped and stripped ("" if absent)."""
        if not isinstance(term, dict):
            return ""
        raw = term.get("name")
        # Unescape like the post title so entity-encoded names (e.g. "&amp;")
        # do not leak into display names and slugs.
        return html.unescape(raw).strip() if isinstance(raw, str) else ""

    def _collect_terms(
        self, groups: object
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]]:
        """Split embedded ``wp:term`` groups into studio, performers and tags.

        Each group's taxonomy is read from its first element. The studio is
        the first non-empty ``category`` name, performers come from ``actors``
        and tags from ``post_tag``; performers and tags are de-duplicated by slug.
        """
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf: set[str] = set()
        seen_tag: set[str] = set()

        if not isinstance(groups, list):
            return studio, performers, tags

        for group in groups:
            if not isinstance(group, list) or not group:
                continue
            head = group[0]
            taxonomy = head.get("taxonomy") if isinstance(head, dict) else None

            if taxonomy == "category":
                if studio is not None:
                    continue
                studio_name = self._term_name(head)
                studio_slug = slugify(studio_name) if studio_name else ""
                # An empty slug would yield the same external_id for every
                # such studio, so skip it.
                if studio_slug:
                    studio = RawStudio(
                        external_id=f"{self.sitetag}:studio:{studio_slug}",
                        name=studio_name,
                        slug=studio_slug,
                    )
            elif taxonomy == "actors":
                for term in group:
                    name = self._term_name(term)
                    if not name:
                        continue
                    slug = slugify(name)
                    if not slug or slug in seen_perf:
                        continue
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(
                            external_id=f"{self.sitetag}:performer:{slug}", name=name
                        )
                    )
            elif taxonomy == "post_tag":
                for term in group:
                    name = self._term_name(term)
                    if not name:
                        continue
                    raw_slug = term.get("slug")
                    slug = raw_slug.strip() if isinstance(raw_slug, str) else ""
                    if not slug:
                        # Fall back to a slug derived from the name.
                        slug = slugify(name)
                    if not slug or slug in seen_tag:
                        continue
                    seen_tag.add(slug)
                    tags.append(
                        RawTag(
                            external_id=f"{self.sitetag}:tag:{slug}",
                            name=name,
                            slug=slug,
                        )
                    )

        return studio, performers, tags