"""porntrex.com — latest-videos BROWSE scraper (KVS), alongside the search scraper.

``PornTrexScraper`` (search, performer-driven) stays in ``ALL_DIRECT_SCRAPERS``
— it provides coverage of the performers' back catalogue.  This browse scraper
adds a freshness guarantee straight from the ``/latest-updates/<page>/`` feed
(watchdog threshold 48h instead of 168h, independent of the performer queue).
Same pattern as xvideos (search + browse in parallel).

KVS listing tile, as consumed by the patterns in this module (attribute order
and surrounding markup may differ on the live site)::

    <a href="https://www.porntrex.com/video/<id>/<slug>/" ...>
        <img data-src="//<host>/<thumb>.jpg" alt="<Title>">
        <... class="duration">MM:SS</...>
    </a>

Playback: KVS, native ``porntrexcom`` extractor (expires+md5 token, portable)
— unchanged.
"""

from __future__ import annotations

import html
import logging
import re

from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get

log = logging.getLogger(__name__)

_BASE = "https://www.porntrex.com"

# Anchor pointing at a video page.  The named group ``url`` is required: every
# consumer below reads the match through ``m.group("url")``.
# NOTE(review): the ``<a ... href=`` prefix is a reconstruction — confirm it
# against live listing markup.
_A_RE = re.compile(
    r'<a\s[^>]*?href="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"',
    re.IGNORECASE,
)
# First ``alt`` attribute inside a tile window; used as the scene title.
_ALT_RE = re.compile(r'alt="([^"]*)"')
# Lazy-loaded, protocol-relative thumbnail URL.
_THUMB_RE = re.compile(
    r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE
)
# ``MM:SS`` or ``H:MM:SS`` inside an element with class "duration".
_DUR_RE = re.compile(
    r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<'
)
# Slug part of a video URL; fallback title source when ``alt`` is missing.
_SLUG_RE = re.compile(r"/video/\d+/([a-z0-9\-]+)")

# Number of characters scanned after the last anchor on a page, where there is
# no following anchor to bound the tile window.
_LAST_TILE_WINDOW = 700


def _parse_duration(text: str | None) -> int | None:
    """Convert ``MM:SS`` or ``H:MM:SS`` into a number of seconds.

    Args:
        text: Colon-separated duration; whitespace around the parts is
            tolerated.  May be ``None`` or empty.

    Returns:
        Total seconds, or ``None`` when *text* is empty, contains a
        non-integer part, or does not have exactly two or three parts.
    """
    if not text:
        return None
    try:
        nums = [int(p.strip()) for p in text.split(":")]
    except ValueError:
        return None
    if len(nums) == 2:
        return nums[0] * 60 + nums[1]
    if len(nums) == 3:
        return nums[0] * 3600 + nums[1] * 60 + nums[2]
    return None


class PornTrexBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes directly from listing tiles.

    ``crawl_page`` is implemented here and never fetches detail pages, so
    ``_parse_detail`` is a stub that always returns ``None``.
    """

    sitetag = "porntrexcom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page number *page*."""
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every video URL found in *listing_html*, in document order.

        URLs are returned as written in the markup: not normalised and not
        de-duplicated.
        """
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Always return ``None``; this scraper does not parse detail pages."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and turn its tiles into ``RawScene`` objects.

        Args:
            page: Listing page number, passed to ``_listing_url``.

        Returns:
            The scenes parsed from the page (possibly an empty list), or
            ``None`` when fetching the page raised an exception.
        """
        url = self._listing_url(page)
        try:
            # NOTE(review): ``self._timeout`` is not defined in this module —
            # presumably set by BaseBrowseScraper; confirm.
            res = browser_get(url, timeout=self._timeout)
            # Accept either a response-like object exposing ``.text`` or the
            # body itself.
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            # Best-effort crawl: log the failure and report it as ``None``.
            log.warning("porntrex browse fetch failed (page %d): %s", page, e)
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            # Normalise so the same video linked with/without "www." or a
            # trailing slash collapses to a single entry.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/")
            if scene_url in seen:
                continue
            seen.add(scene_url)

            # A tile's window runs from its anchor to the next anchor; the
            # last anchor gets a fixed-size tail instead.
            if idx + 1 < len(anchors):
                win_end = anchors[idx + 1].start()
            else:
                win_end = m.end() + _LAST_TILE_WINDOW
            win = text[m.start():win_end]

            am = _ALT_RE.search(win)
            title = html.unescape(am.group(1)).strip() if am else ""
            if not title:
                # Fallback: derive the title from the URL slug.
                sl = _SLUG_RE.search(scene_url)
                title = sl.group(1).replace("-", " ").strip().title() if sl else ""
            if not title:
                continue

            tm = _THUMB_RE.search(win)
            thumb = ("https:" + tm.group(1)) if tm else None
            dm = _DUR_RE.search(win)
            duration_sec = _parse_duration(dm.group(1) if dm else None)

            fingerprints: list[RawFingerprint] = []
            if thumb:
                ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
                if ph:
                    fingerprints.append(RawFingerprint(kind="phash", value=ph))

            out.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    duration_sec=duration_sec,
                    url=scene_url,
                    performers=[],
                    tags=[],
                    fingerprints=fingerprints,
                    playback_sources=[
                        RawPlaybackSource(
                            origin=f"tube:{self.sitetag}",
                            page_url=scene_url,
                            duration_sec=duration_sec,
                            thumbnail_url=thumb,
                        )
                    ],
                )
            )

        log.info("porntrex browse page %d: %d scenes", page, len(out))
        return out