From e42217773f0275e72ca1eca93bda2509ab417eb5 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 3 Jun 2026 11:16:44 +0200 Subject: [PATCH] feat(deep-crawl): xvideos browse source (capped) + per-tube page cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xvideos server-side-renders a JSON-LD VideoObject (duration/title/uploadDate) plus on-page /models/ (performer) and /tags/ links. Sample: median ~10.5min, 93% >=3min. Pilot (2 pages): 29 new scenes, 100% playable + visible + tagged (performers are sparse — xvideos 'new' is amateur-heavy; /models/ links appear mostly on studio rips). - XVideosBrowseScraper (JSON-LD + page-parse of models/tags), added to ALL_BROWSE_SCRAPERS. - deep_crawl._PAGE_CAP: per-sitetag depth cap in pages; xvideoscom=1800 (~newest 50k scenes). At the cap the tube is marked exhausted (reset -> incremental re-sweep) so a mega-tube cannot monopolize the round-robin or balloon the DB. - Ported yesporn.py into the public repo (was prod-only, like hdporngg), ending the __init__ public/prod divergence. youporn rejected: JSON-LD lacks actor/keywords, and its /pornstar/ and /category/ links are A-Z navigation, not scene-specific. xhamster: blocked (429/Cloudflare) from the VPS IP. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- app/connectors/direct_scrapers/__init__.py | 18 + .../direct_scrapers/xvideos_browse.py | 147 ++++++++ app/connectors/direct_scrapers/yesporn.py | 321 ++++++++++++++++++ app/scheduler/deep_crawl.py | 50 ++- 4 files changed, 520 insertions(+), 16 deletions(-) create mode 100644 app/connectors/direct_scrapers/xvideos_browse.py create mode 100644 app/connectors/direct_scrapers/yesporn.py diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 3c6ce6c..0e9d4e8 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -141,9 +141,11 @@ from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402 from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402 from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402 from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401 +from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402 from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402 +from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402 ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ FreshpornoScraper, @@ -164,6 +166,16 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio + # performer + date + duration nadrabiają. PornDoeScraper, + # YesPornVipScraper — dołączony 2026-05-27 (user audit). JSON-LD VideoObject + # + `` per scena (Goon ma + # duration w sekundach gotowe + ISO 8601 release_date z timezone). Studio + + # performerzy z `btn gold` linków (`/channels//` + `/models//`). 
+ # 941k organic monthly (SE Ranking, comparable z porndoe 731k / porntrex 790k). + # Scraper-of-paysites (DogFart / HardX / TeamSkeet / Vixen) — wysokie expected + # canonical match dla studio scenes. Korekta: theporndude scorecard rank 26 + # ('yespornvip.com', score -0.5, auth wall) dotyczył **innej domeny** — pdude.link + # redirect do porndudecams affiliate. Prawdziwa kanoniczna domena to TLD `.vip`. + YesPornVipScraper, # FullmoviesScraper + HDPornGGScraper — dołączone 2026-06-01. KVS engine (sponsor_groups # stack, `/videos//` + `/latest-updates/`). Studio teraz z PREFIKSU tytułu # ("Studio - Scene") — sidebar `/networks/` listował WSZYSTKIE sieci, więc pierwszy match @@ -177,8 +189,14 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # publiczne JSON API (api/v2/video/search): 1 call = 100 filmów z title+length_sec+ # keywords+added+thumb. ~100k filmów, deep-crawl przez crawl_page() (API, bez detail-fetch). EpornerApiScraper, + # XVideosBrowseScraper — dołączony 2026-06-03. SSR JSON-LD (duration/title/uploadDate) + # + page-parse /models/ (performerzy) + /tags/. Sample: median ~10.5min, 93% ≥3min. + # Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie + # full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.) + XVideosBrowseScraper, # 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML # (probe 2026-06-01). Wymagałby headless render — odłożony. + # porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03). # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). ] diff --git a/app/connectors/direct_scrapers/xvideos_browse.py b/app/connectors/direct_scrapers/xvideos_browse.py new file mode 100644 index 0000000..4e6dd51 --- /dev/null +++ b/app/connectors/direct_scrapers/xvideos_browse.py @@ -0,0 +1,147 @@ +"""xvideos.com — deep-crawl browse scraper (JSON-LD + page-parse). 
+ +xvideos SSR-uje JSON-LD VideoObject (duration, name, uploadDate) ORAZ na detail-stronie +linki `/models/` (performerzy tej sceny) + `/tags/` (tagi). Sample 2026-06-03 +(15 scen): median ~10.5min, 93% ≥3min — dobry full-scene content (nie trailery). + +Mega-katalog (~13M) → deep_crawl z per-tube page-cap (xvideoscom w deep_crawl._PAGE_CAP), +żeby nie monopolizował round-robin ani nie zalał bazy. Listing: /new/ (newest). +Scene: /video./. Playback: page_url + origin tube:xvideoscom (istniejący +extractor `xvideoscom` resolvuje stream mobile-side). Phash pominięty (xvideos robi +własne crop-thumbnaile — 0% hit do canonical, jak fullmovies/hdporn). +""" +from __future__ import annotations + +import json +import logging +import re +from datetime import date, datetime + +from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawTag +from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content +from app.normalize.text import slugify + +log = logging.getLogger(__name__) + +_BASE = "https://www.xvideos.com" +_SCENE_URL_RE = re.compile(r'href="(/video\.[0-9a-z]+/[a-z0-9_]+)"', re.IGNORECASE) +_JSONLD_RE = re.compile( + r']+type=["\']application/ld\+json["\'][^>]*>(.*?)', re.IGNORECASE | re.DOTALL +) +_MODEL_RE = re.compile(r'href="/models/([a-z0-9_-]+)"[^>]*>([^<]{2,60})', re.IGNORECASE) +_TAG_RE = re.compile(r'href="/tags/([a-z0-9_-]+)"', re.IGNORECASE) +_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)") +_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE) + + +def _dur_to_sec(value: str | None) -> int | None: + if not value: + return None + m = _ISO_DUR_RE.match(str(value).strip()) + if not m: + return None + total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0) + return total or None + + +def _iso_date(value: str | None) -> date | None: + if not value: + return None + try: + return datetime.fromisoformat(str(value).replace("Z", 
"+00:00")).date() + except ValueError: + m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value)) + return date.fromisoformat(m.group(1)) if m else None + + +def _video_object(html: str) -> dict | None: + for m in _JSONLD_RE.finditer(html): + raw = m.group(1).strip() + if not raw: + continue + try: + data = json.loads(raw) + except (json.JSONDecodeError, ValueError): + continue + items = data if isinstance(data, list) else (data.get("@graph", [data]) if isinstance(data, dict) else []) + for obj in items: + if isinstance(obj, dict) and obj.get("@type") == "VideoObject": + return obj + return None + + +class XVideosBrowseScraper(BaseBrowseScraper): + sitetag = "xvideoscom" + + def _listing_url(self, page: int) -> str: + return f"{_BASE}/new/{page}" + + def _extract_scene_urls(self, listing_html: str) -> list[str]: + seen: set[str] = set() + out: list[str] = [] + for m in _SCENE_URL_RE.finditer(listing_html): + url = f"{_BASE}{m.group(1)}" + if url in seen: + continue + seen.add(url) + out.append(url) + return out + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + video = _video_object(detail_html) or {} + + title = (video.get("name") or "").strip() + if not title: + m = _SETTITLE_RE.search(detail_html) + title = m.group(1).strip() if m else (meta_content(detail_html, property="og:title") or "").strip() + if not title: + return None + + duration_sec = _dur_to_sec(video.get("duration")) + release_date = _iso_date(video.get("uploadDate") or video.get("datePublished")) + thumbnail_url = video.get("thumbnailUrl") or meta_content(detail_html, property="og:image") + if isinstance(thumbnail_url, list): + thumbnail_url = thumbnail_url[0] if thumbnail_url else None + + # Performerzy: linki /models/ (scene-specific; nav xvideos używa innego patternu). 
+ performers: list[RawPerformer] = [] + seen_perf: set[str] = set() + for m in _MODEL_RE.finditer(detail_html): + slug, name = m.group(1), m.group(2).strip() + if not name or slug in seen_perf or name.lower() in ("models", "pornstars"): + continue + seen_perf.add(slug) + performers.append(RawPerformer(external_id=f"{self.sitetag}:model:{slug}", name=name)) + if len(performers) >= 8: + break + + # Tagi: /tags/. + tags: list[RawTag] = [] + seen_tag: set[str] = set() + for m in _TAG_RE.finditer(detail_html): + slug = m.group(1) + if slug in seen_tag or len(slug) > 60: + continue + seen_tag.add(slug) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=slug.replace("-", " "), slug=slug)) + if len(tags) >= 15: + break + + return RawScene( + external_id=f"{self.sitetag}:{scene_url}", + title=title, + duration_sec=duration_sec, + release_date=release_date, + url=scene_url, + performers=performers, + tags=tags, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + duration_sec=duration_sec, + thumbnail_url=thumbnail_url, + ) + ], + raw={"source": "xvideos_browse"}, + ) diff --git a/app/connectors/direct_scrapers/yesporn.py b/app/connectors/direct_scrapers/yesporn.py new file mode 100644 index 0000000..2354d31 --- /dev/null +++ b/app/connectors/direct_scrapers/yesporn.py @@ -0,0 +1,321 @@ +"""yesporn.vip — latest-vids browse scraper. + +Dołączony 2026-05-27. Identyfikowany przez user audit jako "scraper-of-paysites" +(DogFart / HardX / TeamSkeet / Vixen / Brazzers content). Wcześniejszy theporndude +audit pomylił domeny: `yespornvip.com` (z theporndude rankingu) redirectuje przez +pdude.link do `porndudecams.com` affiliate spam — kanoniczna domena ma TLD `.vip`. + +Czemu wart (parity z porndoe): + - **JSON-LD VideoObject** w każdym scene page: name, description, uploadDate + (ISO `YYYY-MM-DDTHH:MM:SS`), duration (ISO `PT0H39M00S`), thumbnailUrl + (BunnyCDN: `yesnn.b-cdn.net/contents/videos_screenshots/...`). 
+ - **``** — durations już w sekundach + (fallback gdy ISO-duration parse fail). + - **``** — ISO 8601 z timezone, redundant z + JSON-LD uploadDate ale czystszy format. + - **``** (multiple) — kanoniczna lista tagów (np. + "Big Ass", "Threesome"). Główne źródło tagów; alternatywnie DOM ma `btn gold` + linki ale te miksują performerów/studio z tagami. + - **Studio + Performers**: oba w sekcji `` + (studio, singular) i `` (performerzy, + multiple). Slugi mają stable per-type salt (`*-i459s7` dla modeli, `*-7p72tp` + dla channels) — zachowują się jak hash z site-version, ale stabilne przez + sesje. + +External_id strategia: `yespornvip:` (`/video/69841/...` → `69841`). +Slug w URL ma `*-npu57w` suffix który wygląda na stałe-per-page-type, ale id +numeryczne jest bezpieczniejsze gdyby site zmienił salt. + +URL patterns: + - Listing: `/latest-updates/` (page 1) / `/latest-updates/N/` (page>1) + - Scene: `/video///` (id numeryczny, slug = title slug + 6-char salt) + - Studio: `/channels//` + - Performer: `/models//` + - Search: `/search//` (nie używane w browse-mode — można dorobić jako + osobny tryb dla performer-driven backfill jeśli będzie potrzeba) + +Playback: download endpoint `/view_video_download.php?id=&format=<480|720|1080>` +z `data-attach-session="PHPSESSID"` — wymaga session cookie, więc nie direct mp4 +z server-side. Plus jest `embedUrl: /embed/` w JSON-LD. Extractor → +`_vps_blocked_fallback.extract` (zgodne z pre-public bandwidth/anonymity policy): +mobile WebView fetcha embed z phone IP, INJECTED_JS scrape'uje ``. +# Slug `` zawiera stable per-type salt (`*-npu57w` dla videos). +_SCENE_URL_RE = re.compile( + r'href="(https://yesporn\.vip/video/(\d+)/[a-z0-9\-]+/)"', + re.IGNORECASE, +) +_VIDEO_ID_RE = re.compile(r"/video/(\d+)/", re.IGNORECASE) + +# Studio (singular) i performerzy (multiple) w ``. +# Studio: `/channels//`. Performer: `/models//`. Tekst linka = +# nazwa wyświetlana (może zawierać CSS-y/inne tagi, więc strip tagów po fakcie). 
+_STUDIO_LINK_RE = re.compile( + r']*>(.*?)', + re.IGNORECASE | re.DOTALL, +) +_PERFORMER_LINK_RE = re.compile( + r']*>(.*?)', + re.IGNORECASE | re.DOTALL, +) +_HTML_TAG_RE = re.compile(r"<[^>]+>") + +# JSON-LD VideoObject — pełny blok między `', + re.IGNORECASE | re.DOTALL, +) + +# `` — multiple, jeden tag per meta. +_META_TAG_RE = re.compile( + r' int | None: + if not value: + return None + m = _ISO_DUR_RE.match(value.strip()) + if not m: + return None + h = int(m.group(1) or 0) + mn = int(m.group(2) or 0) + s = int(m.group(3) or 0) + total = h * 3600 + mn * 60 + s + return total or None + + +def _parse_iso_date(value: str | None) -> date | None: + """`2026-05-26T19:23:29Z` / `2026-05-26T19:23:29.EDT` → date.""" + if not value: + return None + # yesporn emituje `.EDT` jako "timezone" w JSON-LD uploadDate — strip żeby + # `fromisoformat` nie crash'ował. video:release_date meta ma czysty `Z`. + cleaned = re.sub(r"\.[A-Z]{2,4}$", "", value.strip()) + try: + return datetime.fromisoformat(cleaned.replace("Z", "+00:00")).date() + except ValueError: + m = re.match(r"(\d{4}-\d{2}-\d{2})", cleaned) + if m: + try: + return date.fromisoformat(m.group(1)) + except ValueError: + return None + return None + + +def _iter_jsonld_objects(data: object): + """Spłaszcza JSON-LD: dict / list / @graph → strumień dict-ów.""" + if isinstance(data, dict): + graph = data.get("@graph") + if isinstance(graph, list): + for item in graph: + yield from _iter_jsonld_objects(item) + else: + yield data + elif isinstance(data, list): + for item in data: + yield from _iter_jsonld_objects(item) + + +def _extract_video_object(html: str) -> dict | None: + for m in _JSONLD_RE.finditer(html): + raw = m.group(1).strip() + if not raw: + continue + try: + data = json.loads(raw) + except (json.JSONDecodeError, ValueError): + continue + for obj in _iter_jsonld_objects(data): + if obj.get("@type") == "VideoObject": + return obj + return None + + +def _clean_link_text(raw: str) -> str: + """Strip HTML tagów + 
decode entities + whitespace normalize.""" + text = _HTML_TAG_RE.sub("", raw) + text = html_mod.unescape(text) + return " ".join(text.split()).strip() + + +class YesPornVipScraper(BaseBrowseScraper): + sitetag = "yespornvip" + + def _listing_url(self, page: int) -> str: + if page <= 1: + return f"{_BASE}/latest-updates/" + return f"{_BASE}/latest-updates/{page}/" + + def _extract_scene_urls(self, listing_html: str) -> list[str]: + seen: set[str] = set() + out: list[str] = [] + for m in _SCENE_URL_RE.finditer(listing_html): + url = m.group(1) + if url in seen: + continue + seen.add(url) + out.append(url) + return out + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + video = _extract_video_object(detail_html) + if not video: + log.info("yesporn: no JSON-LD VideoObject on %s", scene_url) + return None + + title = (video.get("name") or "").strip() + if not title: + return None + + video_id_m = _VIDEO_ID_RE.search(scene_url) + video_id = video_id_m.group(1) if video_id_m else None + + description = (video.get("description") or "").strip() or None + + # Duration: preferuj `` (czyste sekundy), + # fallback do JSON-LD ISO format. + duration_sec: int | None = None + meta_dur = meta_content(detail_html, property="video:duration") + if meta_dur and meta_dur.isdigit(): + duration_sec = int(meta_dur) or None + if duration_sec is None: + duration_sec = _parse_iso_duration(video.get("duration")) + + # Release date: preferuj `` (czystszy + # format z timezone), fallback do JSON-LD uploadDate. + release_date = _parse_iso_date( + meta_content(detail_html, property="video:release_date") + or video.get("uploadDate") + ) + + thumbnail_url = video.get("thumbnailUrl") or None + + # Studio: pierwszy `btn gold` link do `/channels//`. Strona renderuje + # tylko jednego per scenę (logo studia obok performerów). 
+ studio: RawStudio | None = None + for m in _STUDIO_LINK_RE.finditer(detail_html): + slug = m.group(1).strip() + name = _clean_link_text(m.group(2)) + if not name: + continue + studio = RawStudio( + external_id=f"{self.sitetag}:channel:{slug}", + name=name, + slug=slug, + ) + break + + # Performers: wszystkie `btn gold` linki do `/models//` (multiple). + performers: list[RawPerformer] = [] + seen_perf: set[str] = set() + for m in _PERFORMER_LINK_RE.finditer(detail_html): + slug = m.group(1).strip() + if slug in seen_perf: + continue + name = _clean_link_text(m.group(2)) + if not name: + continue + seen_perf.add(slug) + performers.append( + RawPerformer( + external_id=f"{self.sitetag}:performer:{slug}", + name=name, + ) + ) + + # Tagi: `` (multiple). + # Deny-list: pomiń wszystkie all-lowercase tagi. yesporn.vip SEO-stuffuje + # `meta video:tag` tokenami z tytułu i imionami performerów + gibberish + # ("bella", "rose", "reverse", "deep", "throat", "ddca"), wszystkie always + # lowercase. Legit kategorie są zawsze Title Case ("Big Ass", "Deep + # Throat", "Blonde", "Gangbang") lub UPPER ("MILF", "BBW"). Potwierdzone + # w 20-scene dry-run 2026-05-27. Trade-off: stracimy hipotetyczne legit + # lowercase tagi (np. "interracial" gdyby site je nie capitalize'ował) — + # akceptowalne bo tags mają wagę tylko 0.05 w composite scoring resolvera. + tags: list[RawTag] = [] + seen_tag: set[str] = set() + for m in _META_TAG_RE.finditer(detail_html): + name = html_mod.unescape(m.group(1)).strip() + if not name: + continue + if name == name.lower(): + continue + slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") + if slug in seen_tag: + continue + seen_tag.add(slug) + tags.append( + RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug) + ) + + # Phash z thumbnailUrl — BunnyCDN `yesnn.b-cdn.net` hostuje 1.jpg per scene. 
+ # Hit-rate vs canonical TPDB/StashDB nieznany do pilot run; graceful: brak + # phash → resolver spada do composite scoring (studio + performer + date + + # duration + title token-set) — wszystkie dostępne dzięki JSON-LD. + fingerprints: list[RawFingerprint] = [] + if thumbnail_url: + ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/") + if ph: + fingerprints.append(RawFingerprint(kind="phash", value=ph)) + + # Playback — page_url do strony sceny. Direct mp4 (`view_video_download.php`) + # wymaga PHPSESSID cookie (data-attach-session attribute), więc nie usable + # server-side. Extractor `yespornvip` → `_vps_blocked_fallback.extract`: + # mobile WebView z phone IP łapie session natively, INJECTED_JS scrape. + playback_sources = [ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + duration_sec=duration_sec, + thumbnail_url=thumbnail_url, + ) + ] + + return RawScene( + external_id=f"{self.sitetag}:{video_id or scene_url}", + title=title, + description=description, + release_date=release_date, + duration_sec=duration_sec, + url=scene_url, + studio=studio, + performers=performers, + tags=tags, + fingerprints=fingerprints, + playback_sources=playback_sources, + ) diff --git a/app/scheduler/deep_crawl.py b/app/scheduler/deep_crawl.py index 1d8ec1f..5aef6f7 100644 --- a/app/scheduler/deep_crawl.py +++ b/app/scheduler/deep_crawl.py @@ -31,6 +31,14 @@ log = logging.getLogger(__name__) _DEFAULT_STATE = Path(__file__).resolve().parent.parent / "_state" / "deepcrawl_state.json" +# Per-tube depth cap (stron). Mega-tube'y (xvideos ~13M scen) crawlowane do końca +# zmonopolizowałyby round-robin i zalały bazę — capujemy do ~najnowszych N stron, potem +# exhausted→reset (incremental re-sweep świeżych). Tube'y skończone (porndoe/eporner) bez +# capu (None) → naturalny koniec katalogu. xvideos /new/ ~27 scen/stronę → 1800 ≈ ~50k. 
+_PAGE_CAP: dict[str, int] = { + "xvideoscom": 1800, +} + def _state_path() -> Path: return Path(getattr(get_settings(), "deepcrawl_state_path", None) or _DEFAULT_STATE) @@ -97,8 +105,11 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None return {} scraper = scrapers[sitetag]() + cap = _PAGE_CAP.get(sitetag) # mega-tube depth cap (None = crawl do końca katalogu) start = int(state.get(sitetag, {}).get("last_page", 0)) + 1 end = start + pages_per_run - 1 + if cap is not None: + end = min(end, cap) with session_scope() as session: src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp") @@ -109,23 +120,30 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None last_done = start - 1 exhausted = False - for page in range(start, end + 1): - scenes = scraper.crawl_page(page) - if scenes is None: - # transient fetch-fail listingu — NIE awansuj kursora, następny run powtórzy - break - if not scenes: - log.info("deep-crawl %s: empty page %d → catalog end (exhausted)", sitetag, page) - exhausted = True + if cap is not None and start > cap: + # kursor osiągnął per-tube cap → traktuj jak koniec katalogu (reset re-sweepuje od 1) + exhausted = True + else: + for page in range(start, end + 1): + scenes = scraper.crawl_page(page) + if scenes is None: + # transient fetch-fail listingu — NIE awansuj kursora, następny run powtórzy + break + if not scenes: + log.info("deep-crawl %s: empty page %d → catalog end (exhausted)", sitetag, page) + exhausted = True + last_done = page + break + for raw in scenes: + counters["seen"] += 1 + try: + _process_scene(source_id=source_id, raw_scene=raw, counters=counters) + except Exception: + counters["errors"] += 1 last_done = page - break - for raw in scenes: - counters["seen"] += 1 - try: - _process_scene(source_id=source_id, raw_scene=raw, counters=counters) - except Exception: - counters["errors"] += 1 - last_done = page + if cap is not None and last_done >= cap: + 
log.info("deep-crawl %s: reached page cap %d (exhausted)", sitetag, cap) + exhausted = True st = state.setdefault(sitetag, {}) st["last_page"] = last_done