Both were search-only — fresh only as long as the performer queue cycles and the site search keeps working. Added browse scrapers next to the existing search ones (xvideos/eporner pattern: search keeps performer back-catalog coverage, browse guarantees latest-feed freshness → watchdog 48h instead of 168h): - porntrex: KVS /latest-updates/<n>/ (title + thumb + phash) - mypornerleak: WP REST /wp-json/wp/v2/posts?_embed=1 (title + date + studio from category + performers from the actors taxonomy) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
122 lines
4.5 KiB
Python
122 lines
4.5 KiB
Python
"""porntrex.com — latest-vids BROWSE scraper (KVS), obok istniejącego search scrapera.
|
|
|
|
PornTrexScraper (search, performer-driven) zostaje w ALL_DIRECT_SCRAPERS — daje
|
|
pokrycie back-catalogu performerów. Ten browse dokłada gwarancję świeżości wprost
|
|
z feedu `/latest-updates/<n>/` (próg watchdog 48h zamiast 168h, nie zależy od kolejki
|
|
performerów). Wzorzec jak xvideos (search + browse równolegle).
|
|
|
|
KVS listing tile:
|
|
<div ... data-item-id="<id>"><a href="https://www.porntrex.com/video/<id>/<slug>">
|
|
<img data-src="//ptx.cdntrex.com/contents/.../300x168/1.jpg" alt="<Tytuł>">
|
|
<div class="duration">MM:SS</div>
|
|
Playback: KVS, natywny extractor `porntrexcom` (token expires+md5, portable) — bez zmian.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import html
|
|
import logging
|
|
import re
|
|
|
|
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
)
|
|
from app.extractors import browser_get
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_BASE = "https://www.porntrex.com"
|
|
_A_RE = re.compile(
|
|
r'<a\s+href="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"', re.IGNORECASE
|
|
)
|
|
_ALT_RE = re.compile(r'alt="([^"]*)"')
|
|
_THUMB_RE = re.compile(r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE)
|
|
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
|
|
|
|
|
|
def _parse_duration(text: str | None) -> int | None:
|
|
if not text:
|
|
return None
|
|
try:
|
|
nums = [int(p.strip()) for p in text.split(":")]
|
|
except ValueError:
|
|
return None
|
|
if len(nums) == 2:
|
|
return nums[0] * 60 + nums[1]
|
|
if len(nums) == 3:
|
|
return nums[0] * 3600 + nums[1] * 60 + nums[2]
|
|
return None
|
|
|
|
|
|
class PornTrexBrowseScraper(BaseBrowseScraper):
    """Browse scraper for the porntrex.com ``/latest-updates/<n>/`` feed.

    Scenes are assembled directly from the listing tiles (title, thumbnail,
    duration); no per-scene detail page is parsed.
    """

    sitetag = "porntrexcom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page number ``page``."""
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every video-page URL found in ``listing_html``, in document order.

        URLs are returned exactly as matched: not normalised, not de-duplicated.
        """
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Always return ``None``; ``crawl_page`` builds scenes from the listing itself."""
        return None

    def _scene_from_tile(self, scene_url: str, win: str) -> RawScene | None:
        """Build one scene from the listing HTML fragment that follows its anchor.

        Args:
            scene_url: Normalised video-page URL of the tile.
            win: Listing HTML starting at the tile's anchor.

        Returns:
            The scene, or ``None`` when no title can be derived from either
            the ``alt`` attribute or the URL slug.
        """
        am = _ALT_RE.search(win)
        title = html.unescape(am.group(1)).strip() if am else ""
        if not title:
            # Fallback: derive the title from the URL slug.
            sl = re.search(r"/video/\d+/([a-z0-9\-]+)", scene_url)
            title = sl.group(1).replace("-", " ").strip().title() if sl else ""
        if not title:
            return None

        tm = _THUMB_RE.search(win)
        # The regex captures a protocol-relative URL, so the scheme is added here.
        thumb = ("https:" + tm.group(1)) if tm else None
        dm = _DUR_RE.search(win)
        duration_sec = _parse_duration(dm.group(1) if dm else None)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            performers=[],
            tags=[],
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
        )

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch listing page ``page`` and return the scenes found on it.

        Args:
            page: Listing page number, interpolated into the listing URL.

        Returns:
            The scenes parsed from the page (possibly an empty list), or
            ``None`` when the page could not be fetched or the response
            carried no text body.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            # Accept either a response-like object exposing .text or a raw body.
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("porntrex browse fetch failed (page %d): %s", page, e)
            return None
        if not isinstance(text, str):
            # A None/bytes body would otherwise raise TypeError in the regex
            # scan below, outside the try; report it as a failed fetch instead.
            log.warning(
                "porntrex browse fetch returned no HTML (page %d): %s",
                page,
                type(text).__name__,
            )
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            # Normalise so the www/non-www and trailing-slash variants de-duplicate.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/")
            if scene_url in seen:
                continue
            seen.add(scene_url)

            # A tile's HTML runs up to the next video anchor; the last anchor
            # has no successor, so a fixed 700-character tail is scanned.
            # NOTE(review): if a tile links the same video twice, the window
            # stops at the second link — confirm thumbnail/duration precede it.
            if idx + 1 < len(anchors):
                end = anchors[idx + 1].start()
            else:
                end = m.end() + 700

            scene = self._scene_from_tile(scene_url, text[m.start():end])
            if scene is not None:
                out.append(scene)

        log.info("porntrex browse page %d: %d scenes", page, len(out))
        return out
|