From 2f3e57c0accc27e4c495404d0feae6640e8d1200 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Mon, 22 Jun 2026 12:04:05 +0200 Subject: [PATCH] =?UTF-8?q?feat(ingest):=20revive=20fpoxxx=20=E2=80=94=20s?= =?UTF-8?q?earch=E2=86=92browse=20(KVS=20/new-N/)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fpo.xxx is a KVS site, not WordPress, so the old `?s=` search scraper matched nothing (frozen since 2026-05-07). Converted to a browse scraper reading /new-<N>/ (title + duration + thumbnail + phash from the listing tile; performers via canonical merge). Playback was already phone-side (KVS). 32 fresh scenes on first crawl. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/connectors/direct_scrapers/__init__.py | 5 +- app/connectors/direct_scrapers/fpoxxx.py | 131 +++++++++++++++++++-- 2 files changed, 123 insertions(+), 13 deletions(-) diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index af3474b..0c9d962 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -121,7 +121,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # Special SxyPrnScraper, PerverzijaScraper, - FpoxxxScraper, + # FpoxxxScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22, + # user request). fpo.xxx to KVS, nie WordPress → search `?s=` zwracał 0; browse z + # `/new-/` daje listing tile (tytuł/thumb/duration). Playback i tak phone-side (KVS). ] # Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer. @@ -152,6 +154,7 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ FreshpornoScraper, + FpoxxxScraper, # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven # nie brał feedu "latest"). Listing card: tytuł (z embedded " YY MM DD"), # thumb (studio+date w nazwie), category-* jako tag. 
Performerów listing nie ma diff --git a/app/connectors/direct_scrapers/fpoxxx.py b/app/connectors/direct_scrapers/fpoxxx.py index f53f1f7..650dd51 100644 --- a/app/connectors/direct_scrapers/fpoxxx.py +++ b/app/connectors/direct_scrapers/fpoxxx.py @@ -1,22 +1,129 @@ -"""fpoxxx — direct HTML scrape search results. +"""fpo.xxx — latest-vids browse scraper (KVS engine). -UWAGA: dokładna domena fpoxxx (sitetag w bazie) niekoniecznie zawiera "com" ani -"net" — porn-app DEFAULT_SITETAGS używa "fpoxxx" jako sitetag. Best-guess: fpo.xxx. +Historia: dawniej WordPress-search scraper (`?s=`), ale fpo.xxx to KVS, nie WP — +search zwracał 0 (regex slug-URL nie pasował do `/video//`). Przerobione na +BROWSE (latest z `/new-/`), 2026-06-22 (user request: ożywić zamrożone tuby). -Search: `https://fpo.xxx/page//?s=` (WordPress). -Scene URL: `https://fpo.xxx//`. +Listing tile (`/new-/`): + + → thumb + 1:59:10 → duration +→ tytuł, miniatura, duration, URL sceny. Performerów/tagów listing nie ma czysto + (tytuł bywa JAV-code "Imai Kaho-RKI-602 ..."), więc puste → dorabia canonical-merge. + +Playback: KVS (kt_player + license_code na detail page) — token IP-bound, resolve +PO STRONIE TELEFONU (fpoxxxResolver.ts / WebView fallback, extractor `fpoxxx`). """ from __future__ import annotations +import html +import logging import re -from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.connectors.base import ( + RawFingerprint, + RawPlaybackSource, + RawScene, +) +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, +) +from app.extractors import browser_get + +log = logging.getLogger(__name__) + +_BASE = "https://www.fpo.xxx" +# Kafelek: . Reszta pól w oknie. 
# Listing-tile anchor: an <a> element carrying the scene URL and the title.
# The remaining tile fields (thumbnail, duration) are searched for in the text
# window that follows the anchor.
# NOTE(review): the literal prefix `<a\s+href="` in front of the `url` group is
# an assumption — confirm the attribute layout against live listing markup.
_A_RE = re.compile(
    r'<a\s+href="(?P<url>https?://(?:www\.)?fpo\.xxx/video/\d+/[^"]*)"\s+title="(?P<title>[^"]*)"',
    re.IGNORECASE,
)
_THUMB_RE = re.compile(r'data-original="([^"]+)"', re.IGNORECASE)
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')

# Number of characters scanned after the last anchor on a page, where no
# following anchor exists to close the tile window.
_LAST_TILE_WINDOW = 900


def _parse_duration(text: str | None) -> int | None:
    """Convert a ``H:MM:SS`` or ``MM:SS`` clock string to seconds.

    ``"1:59:10"`` gives 7150 and ``"40:27"`` gives 2427.

    Returns ``None`` for empty input, for a field that is not an integer,
    and for any number of colon-separated fields other than two or three.
    """
    if not text:
        return None
    try:
        fields = [int(part.strip()) for part in text.split(":")]
    except ValueError:
        return None
    if len(fields) not in (2, 3):
        return None
    total = 0
    for value in fields:
        # Each field is worth 60x the one to its right.
        total = total * 60 + value
    return total


class FpoxxxScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes from the fpo.xxx listing tiles alone.

    Title, duration and thumbnail come from the listing page; performers and
    tags are always emitted empty and no detail page is fetched.
    """

    sitetag = "fpoxxx"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for listing page number *page*."""
        return f"{_BASE}/new-{page}/"

    # crawl_page is overridden below, so the two hooks that follow are not used
    # by this class; they exist only so the class can be instantiated.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every scene URL matched in *listing_html*, in document order.

        URLs are returned as matched: not normalised and not de-duplicated.
        """
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Always return ``None``; detail pages are not parsed by this scraper."""
        return None

    def _scene_from_tile(self, scene_url: str, title: str, window: str) -> RawScene:
        """Build one ``RawScene`` from a tile's anchor data and its text window.

        *window* is the listing text from the tile's anchor up to the next
        anchor; the thumbnail URL and the duration are looked up inside it.
        """
        thumb_match = _THUMB_RE.search(window)
        # The value is read out of an HTML attribute, so entities such as
        # `&amp;` must be decoded before the URL is fetched or stored.
        thumb = html.unescape(thumb_match.group(1)) if thumb_match else None

        dur_match = _DUR_RE.search(window)
        duration_sec = _parse_duration(dur_match.group(1) if dur_match else None)

        fingerprints: list[RawFingerprint] = []
        if thumb:
            # NOTE(review): presumably downloads the image; whether it can
            # raise instead of returning a falsy value is not visible here.
            phash = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if phash:
                fingerprints.append(RawFingerprint(kind="phash", value=phash))

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            performers=[],
            tags=[],
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
        )

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch listing page *page* and return one scene per distinct tile.

        Returns ``None`` when the listing cannot be fetched or yields no text,
        otherwise a list (possibly empty) of scenes in document order.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            # browser_get may hand back a response object or the body itself.
            text = getattr(res, "text", res)
        except Exception as e:
            log.warning("fpoxxx browse listing fetch failed (page %d): %s", page, e)
            return None
        if not isinstance(text, str):
            # A missing or bytes body would make the regex scan raise.
            log.warning("fpoxxx browse listing returned no text (page %d)", page)
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            title = html.unescape(m.group("title") or "").strip()
            if not title:
                # Skip before recording the URL, so that a later anchor for
                # the same scene that does carry a title is still accepted.
                continue
            # Canonical form: no `www.` host prefix, exactly one trailing slash.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/") + "/"
            if scene_url in seen:
                continue
            seen.add(scene_url)

            if idx + 1 < len(anchors):
                win_end = anchors[idx + 1].start()
            else:
                win_end = m.end() + _LAST_TILE_WINDOW
            out.append(self._scene_from_tile(scene_url, title, text[m.start():win_end]))

        log.info("fpoxxx browse page %d: %d scenes", page, len(out))
        return out