diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index da28cd8..0c8c84e 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -32,10 +32,12 @@ from app.connectors.direct_scrapers.hqporner import HQPornerScraper from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper +from app.connectors.direct_scrapers.mypornerleak_browse import MyPornerLeakBrowseScraper from app.connectors.direct_scrapers.perverzija import PerverzijaScraper from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper from app.connectors.direct_scrapers.porndish import PornDishScraper from app.connectors.direct_scrapers.porntrex import PornTrexScraper +from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper from app.connectors.direct_scrapers.siska import SiskaScraper from app.connectors.direct_scrapers.sxyland import SxyLandScraper from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper @@ -142,6 +144,11 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ PerverzijaScraper, PornDishScraper, + # Browse równolegle do istniejącego search scrapera (wzorzec xvideos/eporner): + # search zostaje (pokrycie back-catalogu performerów), browse gwarantuje świeżość + # wprost z feedu (watchdog 48h zamiast 168h). Konwersja 2026-06-24 (user request). 
# Tail of the preceding __init__.py hunk that shares these lines:
# ALL_BROWSE_SCRAPERS registers PornTrexBrowseScraper and
# MyPornerLeakBrowseScraper immediately before FreshpornoScraper and
# FpoxxxScraper.
#
# New file: app/connectors/direct_scrapers/mypornerleak_browse.py
"""mypornerleak.com — latest BROWSE scraper via the WordPress REST API.

Runs alongside the search scraper: MyPornerLeakScraper (search) stays in
ALL_DIRECT_SCRAPERS, and this browse scraper adds freshness straight from
WP REST (`/wp-json/wp/v2/posts?_embed=1`). Unlike perverzija/porndish,
mypornerleak EXPOSES the custom `actors` taxonomy over REST, so performers
are available too (not only the studio from `category` and the tags from
`post_tag`).

Playback: the post page embeds a hoster iframe -> extractor
`mypornerleakcom` -> `_embed_iframe`, resolved phone-side (unchanged).
"""
from __future__ import annotations

import html
import json
import logging
from datetime import date, datetime

from app.connectors.base import (
    RawFingerprint,
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify

log = logging.getLogger(__name__)

_BASE = "https://mypornerleak.com"
_PER_PAGE = 20


def _parse_date(value: str | None) -> date | None:
    """Return the calendar date of an ISO-8601 timestamp, or ``None``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Missing,
    empty, non-string or unparseable values all yield ``None``.
    """
    if not isinstance(value, str) or not value:
        return None
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
    except ValueError:
        return None


class MyPornerLeakBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes directly from the WP REST post list."""

    sitetag = "mypornerleakcom"

    def _listing_url(self, page: int) -> str:
        """Return the REST URL of listing page ``page`` with embedded media/terms."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return no URLs: ``crawl_page`` builds scenes from the listing itself."""
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Return ``None``: detail pages are never parsed by this scraper."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts into scenes.

        Returns:
            ``None`` when the fetch raised or the body was not valid JSON;
            an empty list when the response was not HTTP 200, was not a JSON
            list, or was an empty list; otherwise the scenes built from the
            posts (posts without a link or title are skipped).
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("mypornerleak REST fetch failed (page %d): %s", page, e)
            return None
        if res.status_code != 200:
            # WordPress is documented to answer 400 for a page number past the
            # end of the collection, so that case is logged quietly; anything
            # else is worth a warning instead of being dropped silently.
            # NOTE(review): every non-200 still returns [] as before — confirm
            # whether 403/429/5xx should return None like a failed fetch.
            level = logging.INFO if res.status_code == 400 else logging.WARNING
            log.log(level, "mypornerleak REST: HTTP %s (page %d)", res.status_code, page)
            return []
        try:
            posts = json.loads(res.text)
        except (json.JSONDecodeError, ValueError):
            log.warning("mypornerleak REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        out: list[RawScene] = []
        for post in posts:
            scene = self._scene_from_post(post)
            if scene is not None:
                out.append(scene)
        log.info("mypornerleak REST page %d: %d scenes", page, len(out))
        return out

    def _scene_from_post(self, post: object) -> RawScene | None:
        """Build a ``RawScene`` from one REST post, or ``None`` to skip it."""
        if not isinstance(post, dict):
            return None
        link = (post.get("link") or "").strip()
        title_obj = post.get("title")
        rendered = title_obj.get("rendered") if isinstance(title_obj, dict) else None
        # `rendered` may be null in the payload; unescape only real strings.
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        embedded = post.get("_embedded") or {}
        if not isinstance(embedded, dict):
            embedded = {}
        media = embedded.get("wp:featuredmedia") or []
        first_media = media[0] if isinstance(media, list) and media else None
        thumb = (first_media.get("source_url") if isinstance(first_media, dict) else None) or None

        studio, performers, tags = self._terms_from_embedded(embedded)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=_parse_date(post.get("date")),
            url=link,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )

    def _terms_from_embedded(
        self, embedded: dict
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]]:
        """Split the embedded ``wp:term`` groups into studio, performers and tags.

        The taxonomy of a group is read from its first term. The first
        non-empty ``category`` name becomes the studio; ``actors`` become
        performers and ``post_tag`` terms become tags, each de-duplicated by
        slug.
        """
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf: set[str] = set()
        seen_tag: set[str] = set()

        for group in embedded.get("wp:term") or []:
            # An embedded entry is not guaranteed to be a list of dicts.
            terms = [t for t in group if isinstance(t, dict)] if isinstance(group, list) else []
            if not terms:
                continue
            taxonomy = terms[0].get("taxonomy")
            if taxonomy == "category":
                if studio is None:
                    sname = (terms[0].get("name") or "").strip()
                    if sname:
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{slugify(sname)}",
                            name=sname,
                            slug=slugify(sname),
                        )
            elif taxonomy == "actors":
                for term in terms:
                    name = (term.get("name") or "").strip()
                    if not name:
                        continue
                    sl = slugify(name)
                    if sl in seen_perf:
                        continue
                    seen_perf.add(sl)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
                    )
            elif taxonomy == "post_tag":
                for term in terms:
                    name = (term.get("name") or "").strip()
                    if not name:
                        continue
                    # Prefer the slug WordPress already assigned to the tag.
                    sl = (term.get("slug") or slugify(name)).strip()
                    if sl in seen_tag:
                        continue
                    seen_tag.add(sl)
                    tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))
        return studio, performers, tags
# New file: app/connectors/direct_scrapers/porntrex_browse.py
"""porntrex.com — latest-videos BROWSE scraper (KVS), next to the search scraper.

PornTrexScraper (search, performer-driven) stays in ALL_DIRECT_SCRAPERS — it
covers the performers' back catalogue. This browse scraper adds a freshness
guarantee straight from the `/latest-updates/<page>/` feed (watchdog threshold
48h instead of 168h, independent of the performer queue). Same pattern as
xvideos (search + browse in parallel).

KVS listing tile — the fields this module reads from each tile:
  * the anchor `href` pointing at `/video/<id>/<slug>/` (scene URL),
  * the first `alt="..."` attribute (title),
  * the first `data-src="//...jpg|jpeg|webp|png"` attribute (thumbnail),
  * the text of the element with `class="duration"` (MM:SS or H:MM:SS).

Playback: KVS, native `porntrexcom` extractor (expires+md5 token, portable)
— unchanged.
"""
from __future__ import annotations

import html
import logging
import re

from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get

log = logging.getLogger(__name__)

_BASE = "https://www.porntrex.com"
# The named group `url` is required by every `m.group("url")` call below.
# NOTE(review): the anchor prefix of this pattern is a reconstruction — confirm
# against a live listing page.
_A_RE = re.compile(
    r'<a\b[^>]*?\bhref="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"',
    re.IGNORECASE,
)
_ALT_RE = re.compile(r'alt="([^"]*)"')
_THUMB_RE = re.compile(r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE)
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
_SLUG_RE = re.compile(r"/video/\d+/([a-z0-9\-]+)")
# Characters scanned after the last anchor, which has no following anchor to
# bound its tile.
_LAST_TILE_WINDOW = 700


def _parse_duration(text: str | None) -> int | None:
    """Convert ``MM:SS`` or ``H:MM:SS`` into seconds.

    Returns ``None`` for empty input, non-numeric parts, or any other number
    of colon-separated fields.
    """
    if not text:
        return None
    try:
        nums = [int(p.strip()) for p in text.split(":")]
    except ValueError:
        return None
    if len(nums) == 2:
        return nums[0] * 60 + nums[1]
    if len(nums) == 3:
        return nums[0] * 3600 + nums[1] * 60 + nums[2]
    return None


class PornTrexBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes from the listing tiles alone."""

    sitetag = "porntrexcom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of page ``page`` of the latest-updates feed."""
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every video URL matched in ``listing_html``, in page order."""
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Return ``None``: detail pages are never parsed by this scraper."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and build a scene per distinct video URL.

        Returns:
            ``None`` when the fetch raised or produced no text body; an empty
            list when the response carried a non-200 status or no tiles
            matched; otherwise the scenes found on the page.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("porntrex browse fetch failed (page %d): %s", page, e)
            return None
        # Mirror the status check of the mypornerleak browse scraper; the
        # response may also be a bare string, which carries no status.
        status = getattr(res, "status_code", None)
        if isinstance(status, int) and status != 200:
            log.warning("porntrex browse: HTTP %d (page %d)", status, page)
            return []
        if not isinstance(text, str):
            log.warning("porntrex browse: no text body (page %d)", page)
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            # Normalise so the www and non-www variants de-duplicate together.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/")
            if scene_url in seen:
                continue
            seen.add(scene_url)
            # A tile's window runs up to the next anchor; the last one gets a
            # fixed look-ahead instead.
            if idx + 1 < len(anchors):
                end = anchors[idx + 1].start()
            else:
                end = m.end() + _LAST_TILE_WINDOW
            scene = self._scene_from_tile(scene_url, text[m.start():end])
            if scene is not None:
                out.append(scene)
        log.info("porntrex browse page %d: %d scenes", page, len(out))
        return out

    def _scene_from_tile(self, scene_url: str, win: str) -> RawScene | None:
        """Build a ``RawScene`` from one tile window, or ``None`` if untitled."""
        am = _ALT_RE.search(win)
        title = html.unescape(am.group(1)).strip() if am else ""
        if not title:
            # Fallback: derive a title from the URL slug.
            sl = _SLUG_RE.search(scene_url)
            title = sl.group(1).replace("-", " ").strip().title() if sl else ""
        if not title:
            return None

        tm = _THUMB_RE.search(win)
        # Thumbnails are matched as protocol-relative URLs; make them absolute.
        thumb = ("https:" + tm.group(1)) if tm else None
        dm = _DUR_RE.search(win)
        duration_sec = _parse_duration(dm.group(1) if dm else None)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            performers=[],
            tags=[],
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
        )