"""mypornerleak.com — performer-page listing scrape (search-based, performer-driven). 2026-06-16 fix (zamrożony od 05-07): treść serwowana jest pod subdomeną `w8.mypornerleak.com` (load-balancer wN.), a stary regex szukał scen na gołym `mypornerleak.com//` → 0 trafień. Kanoniczny host i tak serwuje stronę listingu i sceny, więc fetchujemy `mypornerleak.com/actor//` a linki scen normalizujemy wN.→ kanoniczny (stabilny page_url/dedup). Wszystko z LISTINGU (bez detail-fetcha) — karta `
` koduje metadane: - klasa `actors-` (multi) → performerzy; `category-` (multi) → tagi - `` → URL sceny + czysty tytuł - `MM:SS|HH:MM:SS` - `` → thumbnail Bez studia: mypornerleak to repost leaków OnlyFans/amatorskich (brak studyjnego źródła). Playback przez extractor `mypornerleakcom` (_embed_iframe → hoster). """ from __future__ import annotations import html import logging import re from collections.abc import Iterator from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag from app.connectors.direct_scrapers._search_base import BaseSearchScraper from app.extractors import browser_get from app.normalize.text import slugify log = logging.getLogger(__name__) _BASE = "https://mypornerleak.com" _ARTICLE_RE = re.compile(r']*\bclass="([^"]+)"', re.IGNORECASE) _LINK_RE = re.compile(r'(?:<[^>]+>)*\s*(\d{1,2}:\d{2}(?::\d{2})?)', re.IGNORECASE) _THUMB_RE = re.compile(r'data-src="([^"]+)"', re.IGNORECASE) _WN_HOST_RE = re.compile(r"https?://w\d+\.mypornerleak\.com", re.IGNORECASE) _CLASS_ACTOR_RE = re.compile(r"\bactors-([a-z0-9-]+)") _CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)") def _parse_duration(s: str) -> int | None: parts = s.split(":") try: if len(parts) == 2: return int(parts[0]) * 60 + int(parts[1]) if len(parts) == 3: return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) except ValueError: return None return None def _name_from_slug(slug: str) -> str: return " ".join(w.capitalize() for w in slug.split("-") if w) class MyPornerLeakScraper(BaseSearchScraper): sitetag = "mypornerleakcom" def search( self, query: str, *, page: int = 1, limit: int | None = None ) -> Iterator[RawScene]: actor_slug = slugify(query) if not actor_slug: return url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "") try: r = browser_get(url, timeout=self._timeout) except Exception as e: log.warning("mypornerleak actor-page fetch failed (%s): %s", url, e) return if r.status_code != 200: return text = r.text anchors = list(_ARTICLE_RE.finditer(text)) seen: set[str] = set() yielded = 0 for idx, m in enumerate(anchors): cls = m.group(1) win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500 window = text[m.start():win_end] link_m = _LINK_RE.search(window) if not link_m: continue scene_url = _WN_HOST_RE.sub(_BASE, link_m.group(1)).rstrip("/") + "/" if scene_url in seen: continue seen.add(scene_url) title = html.unescape(link_m.group(2)).strip() if not title: continue dur_m = _DURATION_RE.search(window) duration_sec = _parse_duration(dur_m.group(1)) if dur_m else None thumb_m = _THUMB_RE.search(window) thumb = thumb_m.group(1) if thumb_m else None # Performerzy + tagi z klasy
. performers: list[RawPerformer] = [] seen_perf: set[str] = set() for am in _CLASS_ACTOR_RE.finditer(cls): sl = am.group(1) if sl in seen_perf: continue seen_perf.add(sl) performers.append( RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl)) ) if not performers: performers.append( RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip()) ) tags: list[RawTag] = [] seen_tag: set[str] = set() for cm in _CLASS_CAT_RE.finditer(cls): sl = re.sub(r"-(porn|leaks?|videos?)$", "", cm.group(1)) if not sl or sl in seen_tag: continue seen_tag.add(sl) tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl)) yield RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, duration_sec=duration_sec, url=scene_url, performers=performers, tags=tags, playback_sources=[ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumb, ) ], ) yielded += 1 if limit is not None and yielded >= limit: return