feat(ingest): add browse scrapers for porntrex + mypornerleak (alongside search)
Both were search-only — fresh only as long as the performer queue cycles and the site search keeps working. Added browse scrapers next to the existing search ones (xvideos/eporner pattern: search keeps performer back-catalog coverage, browse guarantees latest-feed freshness → watchdog 48h instead of 168h): - porntrex: KVS /latest-updates/<n>/ (title + thumb + phash) - mypornerleak: WP REST /wp-json/wp/v2/posts?_embed=1 (title + date + studio from category + performers from the actors taxonomy) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a10c51aebf
commit
55612e262b
3 changed files with 279 additions and 0 deletions
|
|
@ -32,10 +32,12 @@ from app.connectors.direct_scrapers.hqporner import HQPornerScraper
|
|||
from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper
|
||||
from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper
|
||||
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
||||
from app.connectors.direct_scrapers.mypornerleak_browse import MyPornerLeakBrowseScraper
|
||||
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
||||
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||
from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper
|
||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||
|
|
@ -142,6 +144,11 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
|||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||
PerverzijaScraper,
|
||||
PornDishScraper,
|
||||
# Browse równolegle do istniejącego search scrapera (wzorzec xvideos/eporner):
|
||||
# search zostaje (pokrycie back-catalogu performerów), browse gwarantuje świeżość
|
||||
# wprost z feedu (watchdog 48h zamiast 168h). Konwersja 2026-06-24 (user request).
|
||||
PornTrexBrowseScraper,
|
||||
MyPornerLeakBrowseScraper,
|
||||
FreshpornoScraper,
|
||||
FpoxxxScraper,
|
||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||
|
|
|
|||
150
app/connectors/direct_scrapers/mypornerleak_browse.py
Normal file
150
app/connectors/direct_scrapers/mypornerleak_browse.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
"""mypornerleak.com — latest BROWSE scraper via WordPress REST API, obok search scrapera.
|
||||
|
||||
MyPornerLeakScraper (search) zostaje w ALL_DIRECT_SCRAPERS; ten browse dokłada
|
||||
świeżość wprost z WP REST (`/wp-json/wp/v2/posts?_embed=1`). W odróżnieniu od
|
||||
perverzija/porndish, mypornerleak WYSTAWIA custom taksonomię `actors` w REST →
|
||||
mamy też performerów (nie tylko studio z `category` + tagi z `post_tag`).
|
||||
|
||||
Playback: post page embeduje hoster iframe → extractor `mypornerleakcom` →
|
||||
`_embed_iframe`, resolwowany phone-side (bez zmian).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import json
|
||||
import logging
|
||||
from datetime import date, datetime
|
||||
|
||||
from app.connectors.base import (
|
||||
RawFingerprint,
|
||||
RawPerformer,
|
||||
RawPlaybackSource,
|
||||
RawScene,
|
||||
RawStudio,
|
||||
RawTag,
|
||||
)
|
||||
from app.connectors.direct_scrapers._browse_base import (
|
||||
BaseBrowseScraper,
|
||||
compute_thumbnail_phash,
|
||||
)
|
||||
from app.extractors import browser_get
|
||||
from app.normalize.text import slugify
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_BASE = "https://mypornerleak.com"
|
||||
_PER_PAGE = 20
|
||||
|
||||
|
||||
def _parse_date(value: str | None) -> date | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
class MyPornerLeakBrowseScraper(BaseBrowseScraper):
|
||||
sitetag = "mypornerleakcom"
|
||||
|
||||
def _listing_url(self, page: int) -> str:
|
||||
return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"
|
||||
|
||||
def _extract_scene_urls(self, listing_html: str) -> list[str]:
|
||||
return []
|
||||
|
||||
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
|
||||
return None
|
||||
|
||||
def crawl_page(self, page: int) -> list[RawScene] | None:
|
||||
url = self._listing_url(page)
|
||||
try:
|
||||
res = browser_get(url, timeout=self._timeout)
|
||||
except Exception as e:
|
||||
log.warning("mypornerleak REST fetch failed (page %d): %s", page, e)
|
||||
return None
|
||||
if res.status_code != 200:
|
||||
return []
|
||||
try:
|
||||
posts = json.loads(res.text)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
log.warning("mypornerleak REST: bad JSON page %d", page)
|
||||
return None
|
||||
if not isinstance(posts, list) or not posts:
|
||||
return []
|
||||
|
||||
out: list[RawScene] = []
|
||||
for p in posts:
|
||||
link = (p.get("link") or "").strip()
|
||||
title = html.unescape((p.get("title") or {}).get("rendered", "")).strip()
|
||||
if not link or not title:
|
||||
continue
|
||||
release_date = _parse_date(p.get("date"))
|
||||
|
||||
emb = p.get("_embedded") or {}
|
||||
fm = emb.get("wp:featuredmedia") or []
|
||||
thumb = (fm[0].get("source_url") if fm and isinstance(fm[0], dict) else None) or None
|
||||
|
||||
studio: RawStudio | None = None
|
||||
tags: list[RawTag] = []
|
||||
performers: list[RawPerformer] = []
|
||||
seen_tag: set[str] = set()
|
||||
seen_perf: set[str] = set()
|
||||
for group in emb.get("wp:term") or []:
|
||||
if not group:
|
||||
continue
|
||||
tax = group[0].get("taxonomy")
|
||||
if tax == "category" and studio is None:
|
||||
sname = (group[0].get("name") or "").strip()
|
||||
if sname:
|
||||
studio = RawStudio(
|
||||
external_id=f"{self.sitetag}:studio:{slugify(sname)}",
|
||||
name=sname, slug=slugify(sname),
|
||||
)
|
||||
elif tax == "actors":
|
||||
for g in group:
|
||||
name = (g.get("name") or "").strip()
|
||||
sl = slugify(name)
|
||||
if not name or sl in seen_perf:
|
||||
continue
|
||||
seen_perf.add(sl)
|
||||
performers.append(
|
||||
RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
|
||||
)
|
||||
elif tax == "post_tag":
|
||||
for g in group:
|
||||
name = (g.get("name") or "").strip()
|
||||
sl = (g.get("slug") or slugify(name)).strip()
|
||||
if not name or sl in seen_tag:
|
||||
continue
|
||||
seen_tag.add(sl)
|
||||
tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))
|
||||
|
||||
fingerprints: list[RawFingerprint] = []
|
||||
if thumb:
|
||||
ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
|
||||
if ph:
|
||||
fingerprints.append(RawFingerprint(kind="phash", value=ph))
|
||||
|
||||
out.append(
|
||||
RawScene(
|
||||
external_id=f"{self.sitetag}:{link}",
|
||||
title=title,
|
||||
release_date=release_date,
|
||||
url=link,
|
||||
studio=studio,
|
||||
performers=performers,
|
||||
tags=tags,
|
||||
fingerprints=fingerprints,
|
||||
playback_sources=[
|
||||
RawPlaybackSource(
|
||||
origin=f"tube:{self.sitetag}",
|
||||
page_url=link,
|
||||
thumbnail_url=thumb,
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
log.info("mypornerleak REST page %d: %d scenes", page, len(out))
|
||||
return out
|
||||
122
app/connectors/direct_scrapers/porntrex_browse.py
Normal file
122
app/connectors/direct_scrapers/porntrex_browse.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
"""porntrex.com — latest-vids BROWSE scraper (KVS), obok istniejącego search scrapera.
|
||||
|
||||
PornTrexScraper (search, performer-driven) zostaje w ALL_DIRECT_SCRAPERS — daje
|
||||
pokrycie back-catalogu performerów. Ten browse dokłada gwarancję świeżości wprost
|
||||
z feedu `/latest-updates/<n>/` (próg watchdog 48h zamiast 168h, nie zależy od kolejki
|
||||
performerów). Wzorzec jak xvideos (search + browse równolegle).
|
||||
|
||||
KVS listing tile:
|
||||
<div ... data-item-id="<id>"><a href="https://www.porntrex.com/video/<id>/<slug>">
|
||||
<img data-src="//ptx.cdntrex.com/contents/.../300x168/1.jpg" alt="<Tytuł>">
|
||||
<div class="duration">MM:SS</div>
|
||||
Playback: KVS, natywny extractor `porntrexcom` (token expires+md5, portable) — bez zmian.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
|
||||
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
|
||||
from app.connectors.direct_scrapers._browse_base import (
|
||||
BaseBrowseScraper,
|
||||
compute_thumbnail_phash,
|
||||
)
|
||||
from app.extractors import browser_get
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_BASE = "https://www.porntrex.com"
|
||||
_A_RE = re.compile(
|
||||
r'<a\s+href="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"', re.IGNORECASE
|
||||
)
|
||||
_ALT_RE = re.compile(r'alt="([^"]*)"')
|
||||
_THUMB_RE = re.compile(r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE)
|
||||
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
|
||||
|
||||
|
||||
def _parse_duration(text: str | None) -> int | None:
|
||||
if not text:
|
||||
return None
|
||||
try:
|
||||
nums = [int(p.strip()) for p in text.split(":")]
|
||||
except ValueError:
|
||||
return None
|
||||
if len(nums) == 2:
|
||||
return nums[0] * 60 + nums[1]
|
||||
if len(nums) == 3:
|
||||
return nums[0] * 3600 + nums[1] * 60 + nums[2]
|
||||
return None
|
||||
|
||||
|
||||
class PornTrexBrowseScraper(BaseBrowseScraper):
|
||||
sitetag = "porntrexcom"
|
||||
|
||||
def _listing_url(self, page: int) -> str:
|
||||
return f"{_BASE}/latest-updates/{page}/"
|
||||
|
||||
def _extract_scene_urls(self, listing_html: str) -> list[str]:
|
||||
return [m.group("url") for m in _A_RE.finditer(listing_html)]
|
||||
|
||||
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
|
||||
return None
|
||||
|
||||
def crawl_page(self, page: int) -> list[RawScene] | None:
|
||||
url = self._listing_url(page)
|
||||
try:
|
||||
res = browser_get(url, timeout=self._timeout)
|
||||
text = res.text if hasattr(res, "text") else res
|
||||
except Exception as e:
|
||||
log.warning("porntrex browse fetch failed (page %d): %s", page, e)
|
||||
return None
|
||||
|
||||
out: list[RawScene] = []
|
||||
seen: set[str] = set()
|
||||
anchors = list(_A_RE.finditer(text))
|
||||
for idx, m in enumerate(anchors):
|
||||
scene_url = m.group("url").replace("://www.", "://").rstrip("/")
|
||||
if scene_url in seen:
|
||||
continue
|
||||
seen.add(scene_url)
|
||||
win = text[m.start(): (anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 700)]
|
||||
|
||||
am = _ALT_RE.search(win)
|
||||
title = html.unescape(am.group(1)).strip() if am else ""
|
||||
if not title:
|
||||
# fallback: slug → tytuł
|
||||
sl = re.search(r"/video/\d+/([a-z0-9\-]+)", scene_url)
|
||||
title = sl.group(1).replace("-", " ").strip().title() if sl else ""
|
||||
if not title:
|
||||
continue
|
||||
tm = _THUMB_RE.search(win)
|
||||
thumb = ("https:" + tm.group(1)) if tm else None
|
||||
dm = _DUR_RE.search(win)
|
||||
duration_sec = _parse_duration(dm.group(1) if dm else None)
|
||||
|
||||
fingerprints: list[RawFingerprint] = []
|
||||
if thumb:
|
||||
ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
|
||||
if ph:
|
||||
fingerprints.append(RawFingerprint(kind="phash", value=ph))
|
||||
|
||||
out.append(
|
||||
RawScene(
|
||||
external_id=f"{self.sitetag}:{scene_url}",
|
||||
title=title,
|
||||
duration_sec=duration_sec,
|
||||
url=scene_url,
|
||||
performers=[],
|
||||
tags=[],
|
||||
fingerprints=fingerprints,
|
||||
playback_sources=[
|
||||
RawPlaybackSource(
|
||||
origin=f"tube:{self.sitetag}",
|
||||
page_url=scene_url,
|
||||
duration_sec=duration_sec,
|
||||
thumbnail_url=thumb,
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
log.info("porntrex browse page %d: %d scenes", page, len(out))
|
||||
return out
|
||||
Loading…
Add table
Reference in a new issue