"""eporner.com — deep-crawl przez oficjalne JSON API (api/v2/video/search). Detail HTML eporner jest JS-heavy (brak SSR duration/title/tagów) — ALE eporner ma publiczne API zwracające KOMPLETNĄ metadatę w jednym callu: `title`, `length_sec` (duration), `keywords` (tagi), `added` (data), thumb, embed, url. ~100k filmów, `order=latest`, ~100/stronę → ~1000 szybkich calli (BEZ detail-fetch). To czyni eporner idealnym SSR-bogatym źródłem deep-crawla (analiza 2026-06-03: porntrex/hqporner odrzucone — KVS bez SSR duration; eporner-API je zastępuje). Override `crawl_page()` (API flow). HTML-owe _listing_url/_extract/_parse to stuby (BaseBrowseScraper ABC ich wymaga, ale nieużywane). Sitetag `epornercom` = ten sam co search-scraper EpornerScraper → external_id namespace wspólny (dedup). """ from __future__ import annotations import logging from datetime import date, datetime import httpx from app.connectors.base import RawPlaybackSource, RawScene, RawTag from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper from app.normalize.text import slugify log = logging.getLogger(__name__) _API = "https://www.eporner.com/api/v2/video/search/" _PER_PAGE = 100 _UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" def _parse_added(value: str | None) -> date | None: if not value: return None try: return datetime.strptime(value.strip(), "%Y-%m-%d %H:%M:%S").date() except ValueError: return None def _keywords_to_tags(keywords: str | None, sitetag: str) -> list[RawTag]: if not keywords: return [] tags: list[RawTag] = [] seen: set[str] = set() for kw in keywords.split(","): name = kw.strip() # Pomijamy puste + title-jak-keyword (eporner czasem wrzuca cały title jako keyword). if not name or len(name) > 40: continue slug = slugify(name) if not slug or slug in seen or len(slug) > 60: continue seen.add(slug) tags.append(RawTag(external_id=f"{sitetag}:tag:{slug}", name=name, slug=slug)) return tags class EpornerApiScraper(BaseBrowseScraper): sitetag = "epornercom" def crawl_page(self, page: int) -> list[RawScene] | None: params = { "query": "", "per_page": _PER_PAGE, "page": page, "order": "latest", "thumbsize": "medium", "format": "json", } try: with httpx.Client(timeout=self._timeout, follow_redirects=True, headers={"User-Agent": _UA}) as c: r = c.get(_API, params=params) if r.status_code != 200: log.warning("eporner api page %d status %d", page, r.status_code) return None data = r.json() except Exception as e: log.warning("eporner api page %d failed: %s", page, e) return None videos = data.get("videos") or [] if not videos: return [] # poza ostatnią stroną → koniec katalogu (exhausted) out: list[RawScene] = [] for v in videos: url = (v.get("url") or "").strip() title = (v.get("title") or "").strip() if not url or not title: continue dur = v.get("length_sec") duration_sec = int(dur) if dur else None thumb = (v.get("default_thumb") or {}).get("src") out.append( RawScene( external_id=f"{self.sitetag}:{url}", title=title, duration_sec=duration_sec, release_date=_parse_added(v.get("added")), url=url, tags=_keywords_to_tags(v.get("keywords"), self.sitetag), playback_sources=[ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=url, duration_sec=duration_sec, thumbnail_url=thumb, ) ], raw={"source": "eporner_api", "id": v.get("id")}, ) ) return out # HTML stuby — nieużywane (crawl_page nadpisany API-flow), ale ABC ich wymaga. def _listing_url(self, page: int) -> str: # pragma: no cover raise NotImplementedError("EpornerApiScraper używa crawl_page (API), nie HTML listingu") def _extract_scene_urls(self, listing_html: str) -> list[str]: # pragma: no cover raise NotImplementedError def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: # pragma: no cover raise NotImplementedError