feat(deep-crawl): eporner via JSON API as SSR-rich source (Phase 2b alternative)
porntrex/hqporner rejected for deep-crawl: KVS sites with no SSR metadata (77% of existing porntrex has no duration -> invisible under the app's >=60 filter). eporner instead exposes a public JSON API (api/v2/video/search) returning title + length_sec + keywords + added per video; ~100k videos, ~100/page, no per-scene detail fetch.

- BaseBrowseScraper.crawl_page(page): factored out of latest_scenes; returns None (transient fail) / [] (catalog end) / [scenes]. API subclasses override it.
- deep_crawl drives via crawl_page (supports HTML-listing AND API sources).
- EpornerApiScraper: crawl_page hits the eporner API -> RawScene with duration+tags+date+thumb+playback; registered in ALL_BROWSE_SCRAPERS.
- Pilot (2 API pages): 192 new, 100% playable + tagged + visible (>=60); the <180s trailer filter dropped 6 short clips.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0f19a61789
commit
ee4915770f
4 changed files with 181 additions and 59 deletions
|
|
@ -128,7 +128,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
||||||
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
|
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
|
||||||
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
|
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
|
||||||
# co 6h (4×/dobę), pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
|
# raz dziennie, pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
|
||||||
# `app/scheduler/browse_latest.py`.
|
# `app/scheduler/browse_latest.py`.
|
||||||
#
|
#
|
||||||
# **Pilot results (2026-05-12):**
|
# **Pilot results (2026-05-12):**
|
||||||
|
|
@ -143,6 +143,7 @@ from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
|
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
|
||||||
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
||||||
|
|
||||||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
FreshpornoScraper,
|
FreshpornoScraper,
|
||||||
|
|
@ -171,6 +172,11 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# Nawet bez canonical match: grywalny content z inferred tagami (mission: daily tagged ingest).
|
# Nawet bez canonical match: grywalny content z inferred tagami (mission: daily tagged ingest).
|
||||||
FullmoviesScraper,
|
FullmoviesScraper,
|
||||||
HDPornGGScraper,
|
HDPornGGScraper,
|
||||||
|
# EpornerApiScraper — dołączony 2026-06-03 (Faza 2b alternatywa). eporner detail to
|
||||||
|
# JS-heavy KVS bez SSR metadanych (jak porntrex/hqporner — odrzucone), ALE eporner ma
|
||||||
|
# publiczne JSON API (api/v2/video/search): 1 call = 100 filmów z title+length_sec+
|
||||||
|
# keywords+added+thumb. ~100k filmów, deep-crawl przez crawl_page() (API, bez detail-fetch).
|
||||||
|
EpornerApiScraper,
|
||||||
# 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML
|
# 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML
|
||||||
# (probe 2026-06-01). Wymagałby headless render — odłożony.
|
# (probe 2026-06-01). Wymagałby headless render — odłożony.
|
||||||
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
||||||
|
|
|
||||||
|
|
@ -65,50 +65,55 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
|
||||||
Zwraca None gdy scena niedostępna / parse fail — caller pominie ten URL,
|
Zwraca None gdy scena niedostępna / parse fail — caller pominie ten URL,
|
||||||
nie aborti całe browse."""
|
nie aborti całe browse."""
|
||||||
|
|
||||||
def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]:
|
def crawl_page(self, page: int) -> list[RawScene] | None:
    """Crawl ONE listing page and return the scenes parsed from it.

    Shared by browse_latest (top-N pages) and deep_crawl (cursor driven).
    API-based subclasses override this method directly; HTML subclasses
    provide ``_listing_url`` / ``_extract_scene_urls`` / ``_parse_detail``
    and rely on this default implementation.

    Returns:
        None  -- transient failure: the listing fetch raised, or the listing
                 contained scene URLs but not a single detail page could be
                 fetched. Callers should stop and must NOT treat the
                 catalog as exhausted.
        []    -- the listing contained no scene URLs (end of catalog).
        [...] -- the scenes parsed from this page.

    NOTE(review): a page whose detail pages fetch fine but all parse to
    ``None`` still yields ``[]``, which callers cannot tell apart from the
    end of the catalog. Distinguishing it would need a wider return contract.
    """
    url = self._listing_url(page)
    try:
        res = browser_get(url, timeout=self._timeout)
        # browser_get may hand back a response-like object or the raw text.
        html = res.text if hasattr(res, "text") else res
    except Exception as e:
        log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e)
        return None

    urls = self._extract_scene_urls(html)
    if not urls:
        return []

    log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(urls))
    out: list[RawScene] = []
    fetched = 0  # detail pages that were downloaded, whether or not they parsed
    for scene_url in urls:
        try:
            res = browser_get(scene_url, timeout=self._timeout)
            detail_html = res.text if hasattr(res, "text") else res
        except Exception as e:
            # One unreachable scene must not abort the whole page.
            log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
            continue
        fetched += 1
        try:
            raw = self._parse_detail(scene_url, detail_html)
        except Exception as e:
            log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
            continue
        if raw is not None:
            out.append(raw)

    if fetched == 0:
        # The listing had URLs but every detail request failed. This is a
        # network/anti-bot problem, not the end of the catalog, so report it
        # as transient instead of returning [] (which means "exhausted").
        log.warning(
            "%s browse page %d: all %d detail fetches failed", self.sitetag, page, len(urls)
        )
        return None
    return out
|
||||||
|
|
||||||
|
def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]:
    """Yield scenes newest-first from listing pages 1..max_pages.

    Used for the browse_latest forward fill; deep-crawl calls
    ``crawl_page()`` with its own cursor instead. Iteration ends at the
    first page for which ``crawl_page`` returns ``None`` (fetch failure) or
    an empty list (no more scenes).
    """
    page = 1
    while page <= max_pages:
        batch = self.crawl_page(page)
        if not batch:
            # None and [] are both terminal here: nothing more to yield.
            return
        for scene in batch:
            yield scene
        page += 1
|
||||||
urls = self._extract_scene_urls(html)
|
|
||||||
if not urls:
|
|
||||||
log.info("%s browse: empty listing page %d, stopping", self.sitetag, page)
|
|
||||||
break
|
|
||||||
|
|
||||||
log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(urls))
|
|
||||||
for scene_url in urls:
|
|
||||||
try:
|
|
||||||
res = browser_get(scene_url, timeout=self._timeout)
|
|
||||||
detail_html = res.text if hasattr(res, "text") else res
|
|
||||||
except Exception as e:
|
|
||||||
log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
raw = self._parse_detail(scene_url, detail_html)
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if raw is not None:
|
|
||||||
yield raw
|
|
||||||
|
|
||||||
# Stub `search()` — BaseDirectTubeScraper wymaga implementacji. Dla browse-only
|
# Stub `search()` — BaseDirectTubeScraper wymaga implementacji. Dla browse-only
|
||||||
# tubes nie supportujemy performer-driven search; zwracamy pusty iterator. Tube'y
|
# tubes nie supportujemy performer-driven search; zwracamy pusty iterator. Tube'y
|
||||||
|
|
|
||||||
124
app/connectors/direct_scrapers/eporner_api.py
Normal file
124
app/connectors/direct_scrapers/eporner_api.py
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
"""eporner.com — deep-crawl przez oficjalne JSON API (api/v2/video/search).
|
||||||
|
|
||||||
|
Detail HTML eporner jest JS-heavy (brak SSR duration/title/tagów) — ALE eporner ma
|
||||||
|
publiczne API zwracające KOMPLETNĄ metadatę w jednym callu: `title`, `length_sec`
|
||||||
|
(duration), `keywords` (tagi), `added` (data), thumb, embed, url. ~100k filmów,
|
||||||
|
`order=latest`, ~100/stronę → ~1000 szybkich calli (BEZ detail-fetch). To czyni
|
||||||
|
eporner idealnym SSR-bogatym źródłem deep-crawla (analiza 2026-06-03: porntrex/hqporner
|
||||||
|
odrzucone — KVS bez SSR duration; eporner-API je zastępuje).
|
||||||
|
|
||||||
|
Override `crawl_page()` (API flow). HTML-owe _listing_url/_extract/_parse to stuby
|
||||||
|
(BaseBrowseScraper ABC ich wymaga, ale nieużywane). Sitetag `epornercom` = ten sam co
|
||||||
|
search-scraper EpornerScraper → external_id namespace wspólny (dedup).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.connectors.base import RawPlaybackSource, RawScene, RawTag
|
||||||
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_API = "https://www.eporner.com/api/v2/video/search/"
|
||||||
|
_PER_PAGE = 100
|
||||||
|
_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_added(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.strptime(value.strip(), "%Y-%m-%d %H:%M:%S").date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _keywords_to_tags(keywords: str | None, sitetag: str) -> list[RawTag]:
|
||||||
|
if not keywords:
|
||||||
|
return []
|
||||||
|
tags: list[RawTag] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for kw in keywords.split(","):
|
||||||
|
name = kw.strip()
|
||||||
|
# Pomijamy puste + title-jak-keyword (eporner czasem wrzuca cały title jako keyword).
|
||||||
|
if not name or len(name) > 40:
|
||||||
|
continue
|
||||||
|
slug = slugify(name)
|
||||||
|
if not slug or slug in seen or len(slug) > 60:
|
||||||
|
continue
|
||||||
|
seen.add(slug)
|
||||||
|
tags.append(RawTag(external_id=f"{sitetag}:tag:{slug}", name=name, slug=slug))
|
||||||
|
return tags
|
||||||
|
|
||||||
|
|
||||||
|
class EpornerApiScraper(BaseBrowseScraper):
    """Browse scraper that reads eporner's JSON search API instead of HTML.

    ``crawl_page`` is overridden to issue one API request per page and build
    ``RawScene`` objects straight from the response. The HTML hooks that the
    base class requires are stubs that raise ``NotImplementedError``.
    """

    sitetag = "epornercom"

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one API page (ordered by ``latest``) and convert it to scenes.

        Returns:
            None  -- the request failed, returned a non-200 status, or the
                     payload was not the expected JSON object (treated as a
                     transient failure).
            []    -- the payload contained no videos (end of catalog).
            [...] -- one ``RawScene`` per usable video entry; entries without
                     a url or title, or with an unexpected shape, are skipped.
        """
        params = {
            "query": "",
            "per_page": _PER_PAGE,
            "page": page,
            "order": "latest",
            "thumbsize": "medium",
            "format": "json",
        }
        try:
            with httpx.Client(
                timeout=self._timeout,
                follow_redirects=True,
                headers={"User-Agent": _UA},
            ) as c:
                r = c.get(_API, params=params)
                if r.status_code != 200:
                    log.warning("eporner api page %d status %d", page, r.status_code)
                    return None
                data = r.json()
        except Exception as e:
            log.warning("eporner api page %d failed: %s", page, e)
            return None

        # Validate the payload shape before touching it: a malformed response
        # must read as a transient failure, not as an unhandled exception.
        if not isinstance(data, dict):
            log.warning(
                "eporner api page %d: unexpected payload type %s", page, type(data).__name__
            )
            return None
        videos = data.get("videos") or []
        if not isinstance(videos, list):
            log.warning(
                "eporner api page %d: unexpected videos type %s", page, type(videos).__name__
            )
            return None
        if not videos:
            return []  # past the last page -> end of catalog (exhausted)

        out: list[RawScene] = []
        for v in videos:
            scene = self._video_to_scene(v)
            if scene is not None:
                out.append(scene)
        return out

    @staticmethod
    def _coerce_duration(value: object) -> int | None:
        """Convert ``length_sec`` to an int; falsy or non-numeric -> None."""
        if not value:
            return None
        try:
            return int(value)  # type: ignore[call-overload]
        except (TypeError, ValueError):
            return None

    def _video_to_scene(self, v: object) -> RawScene | None:
        """Build a RawScene from one API video entry, or None if unusable."""
        if not isinstance(v, dict):
            return None
        url = v.get("url")
        title = v.get("title")
        if not isinstance(url, str) or not isinstance(title, str):
            return None
        url = url.strip()
        title = title.strip()
        if not url or not title:
            return None

        duration_sec = self._coerce_duration(v.get("length_sec"))
        default_thumb = v.get("default_thumb")
        thumb = default_thumb.get("src") if isinstance(default_thumb, dict) else None
        # Only hand strings to the text helpers; other JSON types count as absent.
        added = v.get("added")
        keywords = v.get("keywords")
        return RawScene(
            external_id=f"{self.sitetag}:{url}",
            title=title,
            duration_sec=duration_sec,
            release_date=_parse_added(added if isinstance(added, str) else None),
            url=url,
            tags=_keywords_to_tags(
                keywords if isinstance(keywords, str) else None, self.sitetag
            ),
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
            raw={"source": "eporner_api", "id": v.get("id")},
        )

    # HTML stubs -- unused (crawl_page is overridden with the API flow), but
    # the abstract base class requires them to exist.
    def _listing_url(self, page: int) -> str:  # pragma: no cover
        raise NotImplementedError("EpornerApiScraper używa crawl_page (API), nie HTML listingu")

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        raise NotImplementedError

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:  # pragma: no cover
        raise NotImplementedError
|
||||||
|
|
@ -24,7 +24,6 @@ from pathlib import Path
|
||||||
from app.config import get_settings
|
from app.config import get_settings
|
||||||
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
||||||
from app.db import session_scope
|
from app.db import session_scope
|
||||||
from app.extractors import browser_get
|
|
||||||
from app.ingest import _process_scene, get_or_create_source
|
from app.ingest import _process_scene, get_or_create_source
|
||||||
from app.models.source import SourceKind
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
|
@ -111,28 +110,16 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None
|
||||||
exhausted = False
|
exhausted = False
|
||||||
|
|
||||||
for page in range(start, end + 1):
|
for page in range(start, end + 1):
|
||||||
try:
|
scenes = scraper.crawl_page(page)
|
||||||
res = browser_get(scraper._listing_url(page), timeout=30)
|
if scenes is None:
|
||||||
html = res.text if hasattr(res, "text") else res
|
# transient fetch-fail listingu — NIE awansuj kursora, następny run powtórzy
|
||||||
except Exception as e:
|
break
|
||||||
log.warning("deep-crawl %s listing page %d failed: %s", sitetag, page, e)
|
if not scenes:
|
||||||
break # nie awansuj kursora przez błąd sieci — następny run powtórzy
|
|
||||||
urls = scraper._extract_scene_urls(html)
|
|
||||||
if not urls:
|
|
||||||
log.info("deep-crawl %s: empty page %d → catalog end (exhausted)", sitetag, page)
|
log.info("deep-crawl %s: empty page %d → catalog end (exhausted)", sitetag, page)
|
||||||
exhausted = True
|
exhausted = True
|
||||||
last_done = page
|
last_done = page
|
||||||
break
|
break
|
||||||
for u in urls:
|
for raw in scenes:
|
||||||
try:
|
|
||||||
r = browser_get(u, timeout=30)
|
|
||||||
dh = r.text if hasattr(r, "text") else r
|
|
||||||
raw = scraper._parse_detail(u, dh)
|
|
||||||
except Exception:
|
|
||||||
counters["errors"] += 1
|
|
||||||
continue
|
|
||||||
if raw is None:
|
|
||||||
continue
|
|
||||||
counters["seen"] += 1
|
counters["seen"] += 1
|
||||||
try:
|
try:
|
||||||
_process_scene(source_id=source_id, raw_scene=raw, counters=counters)
|
_process_scene(source_id=source_id, raw_scene=raw, counters=counters)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue