From e77deef667996426b31bb07b1a2cd5f0beeac2e6 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Tue, 16 Jun 2026 23:16:02 +0200 Subject: [PATCH] fix(mypornerleak): revive search via /actor/ listing + metadata Content moved to the w8.mypornerleak.com (wN) load-balancer subdomain, so the old bare-domain scene regex matched nothing (frozen since 05-07). Rewrote search() to scrape the canonical /actor/<slug>/ listing: scene URL (wN host normalized to canonical for stable dedup), title, duration, performers and category-tags from the <article>
class (actors-*/category-*), thumbnail. No studio (OnlyFans/amateur leaks have none). Multi-performer works; playback unchanged (hoster, phone-side). Co-Authored-By: Claude Opus 4.8 --- .../direct_scrapers/mypornerleak.py | 141 +++++++++++++++++- 1 file changed, 133 insertions(+), 8 deletions(-) diff --git a/app/connectors/direct_scrapers/mypornerleak.py b/app/connectors/direct_scrapers/mypornerleak.py index 7a5738f..69120f7 100644 --- a/app/connectors/direct_scrapers/mypornerleak.py +++ b/app/connectors/direct_scrapers/mypornerleak.py @@ -1,19 +1,144 @@ -"""mypornerleak.com — direct HTML scrape. +"""mypornerleak.com — performer-page listing scrape (search-based, performer-driven). -Search: `https://mypornerleak.com/page//?s=`. -Scene URL: `https://mypornerleak.com//`. +2026-06-16 fix (zamrożony od 05-07): treść serwowana jest pod subdomeną +`w8.mypornerleak.com` (load-balancer wN.), a stary regex szukał scen na gołym +`mypornerleak.com//` → 0 trafień. Kanoniczny host i tak serwuje stronę +listingu i sceny, więc fetchujemy `mypornerleak.com/actor//` a linki scen +normalizujemy wN.→ kanoniczny (stabilny page_url/dedup). + +Wszystko z LISTINGU (bez detail-fetcha) — karta `
` koduje metadane: + - klasa `actors-` (multi) → performerzy; `category-` (multi) → tagi + - `` → URL sceny + czysty tytuł + - `MM:SS|HH:MM:SS` + - `` → thumbnail + +Bez studia: mypornerleak to repost leaków OnlyFans/amatorskich (brak studyjnego +źródła). Playback przez extractor `mypornerleakcom` (_embed_iframe → hoster). """ from __future__ import annotations +import html +import logging import re +from collections.abc import Iterator +from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.extractors import browser_get +from app.normalize.text import slugify + +log = logging.getLogger(__name__) + +_BASE = "https://mypornerleak.com" +_ARTICLE_RE = re.compile(r']*\bclass="([^"]+)"', re.IGNORECASE) +_LINK_RE = re.compile(r'(?:<[^>]+>)*\s*(\d{1,2}:\d{2}(?::\d{2})?)', re.IGNORECASE) +_THUMB_RE = re.compile(r'data-src="([^"]+)"', re.IGNORECASE) +_WN_HOST_RE = re.compile(r"https?://w\d+\.mypornerleak\.com", re.IGNORECASE) +_CLASS_ACTOR_RE = re.compile(r"\bactors-([a-z0-9-]+)") +_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)") + + +def _parse_duration(s: str) -> int | None: + parts = s.split(":") + try: + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + if len(parts) == 3: + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + except ValueError: + return None + return None + + +def _name_from_slug(slug: str) -> str: + return " ".join(w.capitalize() for w in slug.split("-") if w) class MyPornerLeakScraper(BaseSearchScraper): sitetag = "mypornerleakcom" - _search_url_template = "https://mypornerleak.com/page/{page}/?s={query}" - _scene_url_re = re.compile( - r'href="(?Phttps://mypornerleak\.com/(?P[a-z0-9][a-z0-9\-]+))/"', - re.IGNORECASE, - ) + + def search( + self, query: str, *, page: int = 1, limit: int | None = None + ) -> Iterator[RawScene]: + actor_slug = slugify(query) + if not actor_slug: + return + url = 
f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "") + try: + r = browser_get(url, timeout=self._timeout) + except Exception as e: + log.warning("mypornerleak actor-page fetch failed (%s): %s", url, e) + return + if r.status_code != 200: + return + + text = r.text + anchors = list(_ARTICLE_RE.finditer(text)) + seen: set[str] = set() + yielded = 0 + for idx, m in enumerate(anchors): + cls = m.group(1) + win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500 + window = text[m.start():win_end] + + link_m = _LINK_RE.search(window) + if not link_m: + continue + scene_url = _WN_HOST_RE.sub(_BASE, link_m.group(1)).rstrip("/") + "/" + if scene_url in seen: + continue + seen.add(scene_url) + title = html.unescape(link_m.group(2)).strip() + if not title: + continue + + dur_m = _DURATION_RE.search(window) + duration_sec = _parse_duration(dur_m.group(1)) if dur_m else None + thumb_m = _THUMB_RE.search(window) + thumb = thumb_m.group(1) if thumb_m else None + + # Performerzy + tagi z klasy
. + performers: list[RawPerformer] = [] + seen_perf: set[str] = set() + for am in _CLASS_ACTOR_RE.finditer(cls): + sl = am.group(1) + if sl in seen_perf: + continue + seen_perf.add(sl) + performers.append( + RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl)) + ) + if not performers: + performers.append( + RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip()) + ) + + tags: list[RawTag] = [] + seen_tag: set[str] = set() + for cm in _CLASS_CAT_RE.finditer(cls): + sl = re.sub(r"-(porn|leaks?|videos?)$", "", cm.group(1)) + if not sl or sl in seen_tag: + continue + seen_tag.add(sl) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl)) + + yield RawScene( + external_id=f"{self.sitetag}:{scene_url}", + title=title, + duration_sec=duration_sec, + url=scene_url, + performers=performers, + tags=tags, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + duration_sec=duration_sec, + thumbnail_url=thumb, + ) + ], + ) + yielded += 1 + if limit is not None and yielded >= limit: + return