# goon/app/connectors/direct_scrapers/porntrex_browse.py

"""porntrex.com — latest-vids BROWSE scraper (KVS), obok istniejącego search scrapera.

PornTrexScraper (search, performer-driven) zostaje w ALL_DIRECT_SCRAPERS — daje
pokrycie back-catalogu performerów. Ten browse dokłada gwarancję świeżości wprost
z feedu `/latest-updates/<n>/` (próg watchdog 48h zamiast 168h, nie zależy od kolejki
performerów). Wzorzec jak xvideos (search + browse równolegle).

KVS listing tile:
  <div ... data-item-id="<id>"><a href="https://www.porntrex.com/video/<id>/<slug>">
    <img data-src="//ptx.cdntrex.com/contents/.../300x168/1.jpg" alt="<Tytuł>">
    <div class="duration">MM:SS</div>
Playback: KVS, natywny extractor `porntrexcom` (token expires+md5, portable) — bez zmian.
"""
from __future__ import annotations

import html
import logging
import re

from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
)
from app.extractors import browser_get

log = logging.getLogger(__name__)

# Site origin: prefix for listing URLs and (with a trailing slash) the referer
# passed when thumbnails are fetched for hashing.
_BASE = "https://www.porntrex.com"
# Anchor whose first attribute is an absolute href to a video page
# (`/video/<digits>/<rest>`), http or https, with or without `www.`;
# matched case-insensitively. The full URL is captured as group `url`.
_A_RE = re.compile(
    r'<a\s+href="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"', re.IGNORECASE
)
# Value of an `alt="..."` attribute (case-sensitive); used as the scene title.
_ALT_RE = re.compile(r'alt="([^"]*)"')
# Protocol-relative (`//host/...`) `data-src` URL containing a
# jpg/jpeg/webp/png extension; the scheme is added by the caller.
_THUMB_RE = re.compile(r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE)
# Text following `class="duration">`: one or two digits, then one or two
# `:DD` groups (i.e. M:SS / MM:SS / H:MM:SS), whitespace around colons allowed.
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')


def _parse_duration(text: str | None) -> int | None:
    if not text:
        return None
    try:
        nums = [int(p.strip()) for p in text.split(":")]
    except ValueError:
        return None
    if len(nums) == 2:
        return nums[0] * 60 + nums[1]
    if len(nums) == 3:
        return nums[0] * 3600 + nums[1] * 60 + nums[2]
    return None


class PornTrexBrowseScraper(BaseBrowseScraper):
    """Browse scraper for the porntrex.com ``/latest-updates/<n>/`` feed.

    ``crawl_page`` builds scenes directly from the listing tiles (title,
    thumbnail, duration) and does not fetch or parse detail pages, which is
    why ``_parse_detail`` always returns ``None``.
    """

    sitetag = "porntrexcom"

    # Number of characters scanned after the last video anchor on a page,
    # where there is no following anchor to bound the tile.
    _TAIL_WINDOW_CHARS = 700

    def _listing_url(self, page: int) -> str:
        """Return the URL of page ``page`` of the latest-updates feed."""
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every video URL matched in ``listing_html``, in document order.

        URLs are returned exactly as matched: neither normalized nor
        de-duplicated.
        """
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Always return ``None``; scenes are built from the listing instead."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and turn its tiles into scenes.

        Args:
            page: Listing page number, interpolated into the feed URL.

        Returns:
            The scenes parsed from the page (possibly an empty list), or
            ``None`` when the fetch raised or the response carried no text.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("porntrex browse fetch failed (page %d): %s", page, e)
            return None
        # A None/bytes payload would make the str regexes below raise
        # TypeError outside the try block; report it as a failed fetch instead.
        if not isinstance(text, str):
            log.warning(
                "porntrex browse fetch returned non-text body (page %d): %s",
                page,
                type(text).__name__,
            )
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            # Canonical form (no "www.", no trailing slash) so that URL
            # variants of the same video collapse into one entry.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/")
            if scene_url in seen:
                continue
            seen.add(scene_url)

            # A tile's HTML runs up to the next video anchor; the last anchor
            # has no successor, so a fixed-size slice is used instead.
            if idx + 1 < len(anchors):
                end = anchors[idx + 1].start()
            else:
                end = m.end() + self._TAIL_WINDOW_CHARS

            scene = self._scene_from_tile(scene_url, text[m.start():end])
            if scene is not None:
                out.append(scene)
        log.info("porntrex browse page %d: %d scenes", page, len(out))
        return out

    def _scene_from_tile(self, scene_url: str, tile_html: str) -> RawScene | None:
        """Build a scene from one tile's HTML; ``None`` when no title is found.

        Args:
            scene_url: Canonical video URL of the tile.
            tile_html: HTML slice starting at the tile's video anchor.
        """
        am = _ALT_RE.search(tile_html)
        title = html.unescape(am.group(1)).strip() if am else ""
        if not title:
            # Fallback: derive a title from the URL slug.
            sl = re.search(r"/video/\d+/([a-z0-9\-]+)", scene_url)
            title = sl.group(1).replace("-", " ").strip().title() if sl else ""
        if not title:
            return None

        tm = _THUMB_RE.search(tile_html)
        # The pattern only matches protocol-relative URLs, so add the scheme.
        thumb = ("https:" + tm.group(1)) if tm else None
        dm = _DUR_RE.search(tile_html)
        duration_sec = _parse_duration(dm.group(1) if dm else None)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            performers=[],
            tags=[],
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
        )