From ee4915770f503e448698518d89f63e8f0bf586da Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 3 Jun 2026 10:37:20 +0200 Subject: [PATCH] feat(deep-crawl): eporner via JSON API as SSR-rich source (Phase 2b alternative) porntrex/hqporner rejected for deep-crawl: KVS sites with no SSR metadata (77% of existing porntrex has no duration -> invisible under the app's >=60 filter). eporner instead exposes a public JSON API (api/v2/video/search) returning title + length_sec + keywords + added per video; ~100k videos, ~100/page, no per-scene detail fetch. - BaseBrowseScraper.crawl_page(page): factored out of latest_scenes; returns None (transient fail) / [] (catalog end) / [scenes]. API subclasses override it. - deep_crawl drives via crawl_page (supports HTML-listing AND API sources). - EpornerApiScraper: crawl_page hits the eporner API -> RawScene with duration+tags+ date+thumb+playback; registered in ALL_BROWSE_SCRAPERS. - Pilot (2 API pages): 192 new, 100% playable + tagged + visible (>=60); the <180s trailer filter dropped 6 short clips. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/connectors/direct_scrapers/__init__.py | 8 +- .../direct_scrapers/_browse_base.py | 83 ++++++------ app/connectors/direct_scrapers/eporner_api.py | 124 ++++++++++++++++++ app/scheduler/deep_crawl.py | 25 +--- 4 files changed, 181 insertions(+), 59 deletions(-) create mode 100644 app/connectors/direct_scrapers/eporner_api.py diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 887d955..3c6ce6c 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -128,7 +128,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer. # Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do # canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane -# co 6h (4×/dobę), pages 1-5. Patrz `_browse_base.BaseBrowseScraper` + +# raz dziennie, pages 1-5. Patrz `_browse_base.BaseBrowseScraper` + # `app/scheduler/browse_latest.py`. # # **Pilot results (2026-05-12):** @@ -143,6 +143,7 @@ from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402 from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402 +from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402 ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ FreshpornoScraper, @@ -171,6 +172,11 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # Nawet bez canonical match: grywalny content z inferred tagami (mission: daily tagged ingest). FullmoviesScraper, HDPornGGScraper, + # EpornerApiScraper — dołączony 2026-06-03 (Faza 2b alternatywa). eporner detail to + # JS-heavy KVS bez SSR metadanych (jak porntrex/hqporner — odrzucone), ALE eporner ma + # publiczne JSON API (api/v2/video/search): 1 call = 100 filmów z title+length_sec+ + # keywords+added+thumb. ~100k filmów, deep-crawl przez crawl_page() (API, bez detail-fetch). + EpornerApiScraper, # 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML # (probe 2026-06-01). Wymagałby headless render — odłożony. # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). diff --git a/app/connectors/direct_scrapers/_browse_base.py b/app/connectors/direct_scrapers/_browse_base.py index 9a76397..c94a646 100644 --- a/app/connectors/direct_scrapers/_browse_base.py +++ b/app/connectors/direct_scrapers/_browse_base.py @@ -65,50 +65,55 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC): Zwraca None gdy scena niedostępna / parse fail — caller pominie ten URL, nie aborti całe browse.""" - def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]: - """Iteruje sceny od najnowszych: page 1..max_pages × N scen/page. + def crawl_page(self, page: int) -> list[RawScene] | None: + """Crawl JEDNEJ strony listingu → lista RawScene. Wspólne dla browse_latest + (top-N) i deep_crawl (kursor). Zwraca: + None — transient fetch-fail listingu (caller: stop, NIE oznaczaj exhausted), + [] — pusty listing = koniec katalogu (caller: exhausted), + [...] — sceny z tej strony. - Domyślnie max_pages=5 → ~100 scen per tube per run (shyfap, freshporno - ~20 scen/page). Schedulowane co 6h (4×/dobę) → catch-up po przerwie. - - Dedup po external_id zachodzi w resolverze (path 1 same_source) — gdy - scena już była, update last_seen + skip. Więc bezpieczne nawet gdy te - same N scen pojawia się przez kilka dni. + API-based subclasses (np. EpornerApiScraper) override'ują crawl_page bezpośrednio + (call API zamiast listing→detail). HTML browse subclasses dostarczają + _listing_url/_extract_scene_urls/_parse_detail i używają tej domyślnej impl. """ - # search() nie jest implementowany przez subclass dla browse-only tube'ów — - # `BaseDirectTubeScraper.search` to abstrakt, więc dodajemy stub żeby - # przepuścić abc, ale faktyczna ścieżka pracy idzie przez latest_scenes(). - for page in range(1, max_pages + 1): - url = self._listing_url(page) + url = self._listing_url(page) + try: + res = browser_get(url, timeout=self._timeout) + html = res.text if hasattr(res, "text") else res + except Exception as e: + log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e) + return None + + urls = self._extract_scene_urls(html) + if not urls: + return [] + + log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(urls)) + out: list[RawScene] = [] + for scene_url in urls: try: - res = browser_get(url, timeout=self._timeout) - html = res.text if hasattr(res, "text") else res + res = browser_get(scene_url, timeout=self._timeout) + detail_html = res.text if hasattr(res, "text") else res except Exception as e: - log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e) + log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e) + continue + try: + raw = self._parse_detail(scene_url, detail_html) + except Exception as e: + log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e) + continue + if raw is not None: + out.append(raw) + return out + + def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]: + """Iteruje sceny od najnowszych: page 1..max_pages (browse_latest forward-fill). + Deep-crawl używa crawl_page() z kursorem osobno. Stop na None/[] (fail/koniec).""" + for page in range(1, max_pages + 1): + scenes = self.crawl_page(page) + if not scenes: # None (fetch fail) lub [] (pusty listing = koniec) → stop break - - urls = self._extract_scene_urls(html) - if not urls: - log.info("%s browse: empty listing page %d, stopping", self.sitetag, page) - break - - log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(urls)) - for scene_url in urls: - try: - res = browser_get(scene_url, timeout=self._timeout) - detail_html = res.text if hasattr(res, "text") else res - except Exception as e: - log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e) - continue - - try: - raw = self._parse_detail(scene_url, detail_html) - except Exception as e: - log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e) - continue - - if raw is not None: - yield raw + yield from scenes # Stub `search()` — BaseDirectTubeScraper wymaga implementacji. Dla browse-only # tubes nie supportujemy performer-driven search; zwracamy pusty iterator. Tube'y diff --git a/app/connectors/direct_scrapers/eporner_api.py b/app/connectors/direct_scrapers/eporner_api.py new file mode 100644 index 0000000..141d8d5 --- /dev/null +++ b/app/connectors/direct_scrapers/eporner_api.py @@ -0,0 +1,124 @@ +"""eporner.com — deep-crawl przez oficjalne JSON API (api/v2/video/search). + +Detail HTML eporner jest JS-heavy (brak SSR duration/title/tagów) — ALE eporner ma +publiczne API zwracające KOMPLETNĄ metadatę w jednym callu: `title`, `length_sec` +(duration), `keywords` (tagi), `added` (data), thumb, embed, url. ~100k filmów, +`order=latest`, ~100/stronę → ~1000 szybkich calli (BEZ detail-fetch). To czyni +eporner idealnym SSR-bogatym źródłem deep-crawla (analiza 2026-06-03: porntrex/hqporner +odrzucone — KVS bez SSR duration; eporner-API je zastępuje). + +Override `crawl_page()` (API flow). HTML-owe _listing_url/_extract/_parse to stuby +(BaseBrowseScraper ABC ich wymaga, ale nieużywane). Sitetag `epornercom` = ten sam co +search-scraper EpornerScraper → external_id namespace wspólny (dedup). +""" +from __future__ import annotations + +import logging +from datetime import date, datetime + +import httpx + +from app.connectors.base import RawPlaybackSource, RawScene, RawTag +from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper +from app.normalize.text import slugify + +log = logging.getLogger(__name__) + +_API = "https://www.eporner.com/api/v2/video/search/" +_PER_PAGE = 100 +_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" + + +def _parse_added(value: str | None) -> date | None: + if not value: + return None + try: + return datetime.strptime(value.strip(), "%Y-%m-%d %H:%M:%S").date() + except ValueError: + return None + + +def _keywords_to_tags(keywords: str | None, sitetag: str) -> list[RawTag]: + if not keywords: + return [] + tags: list[RawTag] = [] + seen: set[str] = set() + for kw in keywords.split(","): + name = kw.strip() + # Pomijamy puste + title-jak-keyword (eporner czasem wrzuca cały title jako keyword). + if not name or len(name) > 40: + continue + slug = slugify(name) + if not slug or slug in seen or len(slug) > 60: + continue + seen.add(slug) + tags.append(RawTag(external_id=f"{sitetag}:tag:{slug}", name=name, slug=slug)) + return tags + + +class EpornerApiScraper(BaseBrowseScraper): + sitetag = "epornercom" + + def crawl_page(self, page: int) -> list[RawScene] | None: + params = { + "query": "", + "per_page": _PER_PAGE, + "page": page, + "order": "latest", + "thumbsize": "medium", + "format": "json", + } + try: + with httpx.Client(timeout=self._timeout, follow_redirects=True, headers={"User-Agent": _UA}) as c: + r = c.get(_API, params=params) + if r.status_code != 200: + log.warning("eporner api page %d status %d", page, r.status_code) + return None + data = r.json() + except Exception as e: + log.warning("eporner api page %d failed: %s", page, e) + return None + + videos = data.get("videos") or [] + if not videos: + return [] # poza ostatnią stroną → koniec katalogu (exhausted) + + out: list[RawScene] = [] + for v in videos: + url = (v.get("url") or "").strip() + title = (v.get("title") or "").strip() + if not url or not title: + continue + dur = v.get("length_sec") + duration_sec = int(dur) if dur else None + thumb = (v.get("default_thumb") or {}).get("src") + out.append( + RawScene( + external_id=f"{self.sitetag}:{url}", + title=title, + duration_sec=duration_sec, + release_date=_parse_added(v.get("added")), + url=url, + tags=_keywords_to_tags(v.get("keywords"), self.sitetag), + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=url, + duration_sec=duration_sec, + thumbnail_url=thumb, + ) + ], + raw={"source": "eporner_api", "id": v.get("id")}, + ) + ) + return out + + # HTML stuby — nieużywane (crawl_page nadpisany API-flow), ale ABC ich wymaga. + def _listing_url(self, page: int) -> str: # pragma: no cover + raise NotImplementedError("EpornerApiScraper używa crawl_page (API), nie HTML listingu") + + def _extract_scene_urls(self, listing_html: str) -> list[str]: # pragma: no cover + raise NotImplementedError + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: # pragma: no cover + raise NotImplementedError diff --git a/app/scheduler/deep_crawl.py b/app/scheduler/deep_crawl.py index b89d9b8..1d8ec1f 100644 --- a/app/scheduler/deep_crawl.py +++ b/app/scheduler/deep_crawl.py @@ -24,7 +24,6 @@ from pathlib import Path from app.config import get_settings from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS from app.db import session_scope -from app.extractors import browser_get from app.ingest import _process_scene, get_or_create_source from app.models.source import SourceKind @@ -111,28 +110,16 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None exhausted = False for page in range(start, end + 1): - try: - res = browser_get(scraper._listing_url(page), timeout=30) - html = res.text if hasattr(res, "text") else res - except Exception as e: - log.warning("deep-crawl %s listing page %d failed: %s", sitetag, page, e) - break # nie awansuj kursora przez błąd sieci — następny run powtórzy - urls = scraper._extract_scene_urls(html) - if not urls: + scenes = scraper.crawl_page(page) + if scenes is None: + # transient fetch-fail listingu — NIE awansuj kursora, następny run powtórzy + break + if not scenes: log.info("deep-crawl %s: empty page %d → catalog end (exhausted)", sitetag, page) exhausted = True last_done = page break - for u in urls: - try: - r = browser_get(u, timeout=30) - dh = r.text if hasattr(r, "text") else r - raw = scraper._parse_detail(u, dh) - except Exception: - counters["errors"] += 1 - continue - if raw is None: - continue + for raw in scenes: counters["seen"] += 1 try: _process_scene(source_id=source_id, raw_scene=raw, counters=counters)