From 2f3e57c0accc27e4c495404d0feae6640e8d1200 Mon Sep 17 00:00:00 2001
From: jtrzupek <jtrzupek@gmail.com>
Date: Mon, 22 Jun 2026 12:04:05 +0200
Subject: [PATCH] =?UTF-8?q?feat(ingest):=20revive=20fpoxxx=20=E2=80=94=20s?=
 =?UTF-8?q?earch=E2=86=92browse=20(KVS=20/new-N/)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

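fpo.xxx runs on the KVS engine, not WordPress, so the old `?s=` search scraper
matched nothing (the tube has been frozen since 2026-05-07). FpoxxxScraper is now a
browse scraper: it moves from ALL_DIRECT_SCRAPERS to ALL_BROWSE_SCRAPERS and reads the
/new-<n>/ listing, taking title, duration, thumbnail and thumbnail pHash from each
listing tile. Performers and tags are left empty and come from canonical merge.
Playback is unchanged; it was already resolved phone-side (KVS). The first crawl
returned 32 fresh scenes.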

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/connectors/direct_scrapers/__init__.py |   5 +-
 app/connectors/direct_scrapers/fpoxxx.py   | 131 +++++++++++++++++++--
 2 files changed, 123 insertions(+), 13 deletions(-)
diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py
index af3474b..0c9d962 100644
--- a/app/connectors/direct_scrapers/__init__.py
+++ b/app/connectors/direct_scrapers/__init__.py
@@ -121,7 +121,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
     # Special
     SxyPrnScraper,
     PerverzijaScraper,
-    FpoxxxScraper,
+    # FpoxxxScraper — moved to ALL_BROWSE_SCRAPERS (browse conversion 2026-06-22, user
+    # request). fpo.xxx is KVS, not WordPress, so the `?s=` search returned 0 results; the
+    # `/new-<n>/` listing tiles provide title/thumb/duration. Playback is phone-side (KVS).
 ]
 
 # Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
@@ -152,6 +154,7 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
 
 ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
     FreshpornoScraper,
+    FpoxxxScraper,
     # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
     # nie brał feedu "latest"). Listing card: tytuł (z embedded "<Studio> YY MM DD"),
     # thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma
diff --git a/app/connectors/direct_scrapers/fpoxxx.py b/app/connectors/direct_scrapers/fpoxxx.py
index f53f1f7..650dd51 100644
--- a/app/connectors/direct_scrapers/fpoxxx.py
+++ b/app/connectors/direct_scrapers/fpoxxx.py
@@ -1,22 +1,129 @@
-"""fpoxxx — direct HTML scrape search results.
+"""fpo.xxx — latest-vids browse scraper (KVS engine).
 
-UWAGA: dokładna domena fpoxxx (sitetag w bazie) niekoniecznie zawiera "com" ani
-"net" — porn-app DEFAULT_SITETAGS używa "fpoxxx" jako sitetag. Best-guess: fpo.xxx.
+History: formerly a WordPress-search scraper (`?s=`), but fpo.xxx is KVS, not WP, so
+search returned 0 results (the slug-URL regex did not match `/video/<id>/`). Converted to
+BROWSE (latest from `/new-<n>/`) on 2026-06-22 (user request: revive frozen tubes).
 
-Search: `https://fpo.xxx/page/<n>/?s=<q>` (WordPress).
-Scene URL: `https://fpo.xxx/<slug>/`.
+Listing tile (`/new-<n>/`):
+  <a href="https://www.fpo.xxx/video/<id>/<slug>/" title="<Tytuł>">
+    <img data-original="...screenshots/.../320x180/1.jpg">          → thumb
+    <span class="duration">1:59:10</span>                           → duration
+→ title, thumbnail, duration, scene URL. The listing exposes no clean performers/tags (the
+  title is sometimes a JAV code, "Imai Kaho-RKI-602 ..."), so they stay empty → canonical-merge.
+
+Playback: KVS (kt_player + license_code on the detail page) — the token is IP-bound, so it
+is resolved ON THE PHONE (fpoxxxResolver.ts / WebView fallback, extractor `fpoxxx`).
 """
 from __future__ import annotations
 
+import html
+import logging
 import re
 
-from app.connectors.direct_scrapers._search_base import BaseSearchScraper
+from app.connectors.base import (
+    RawFingerprint,
+    RawPlaybackSource,
+    RawScene,
+)
+from app.connectors.direct_scrapers._browse_base import (
+    BaseBrowseScraper,
+    compute_thumbnail_phash,
+)
+from app.extractors import browser_get
+
+log = logging.getLogger(__name__)
+
+_BASE = "https://www.fpo.xxx"
+# Tile anchor: <a href="...fpo.xxx/video/<id>/<slug>/" title="<title>">. Thumb and duration are searched in the text window starting at it.
+_A_RE = re.compile(
+    r'<a\s+href="(?P<url>https?://(?:www\.)?fpo\.xxx/video/\d+/[^"]*)"\s+title="(?P<title>[^"]*)"',
+    re.IGNORECASE,
+)
+_THUMB_RE = re.compile(r'data-original="([^"]+)"', re.IGNORECASE)
+_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
 
 
-class FpoxxxScraper(BaseSearchScraper):
+def _parse_duration(text: str | None) -> int | None:
+    """`1:59:10`→7150 (H:MM:SS); `40:27`→2427 (MM:SS). None when missing or unparsable."""
+    if not text:
+        return None
+    try:
+        nums = [int(p.strip()) for p in text.split(":")]
+    except ValueError:
+        return None
+    if len(nums) == 2:
+        return nums[0] * 60 + nums[1]
+    if len(nums) == 3:
+        return nums[0] * 3600 + nums[1] * 60 + nums[2]
+    return None
+
+
+class FpoxxxScraper(BaseBrowseScraper):
     sitetag = "fpoxxx"
-    _search_url_template = "https://fpo.xxx/page/{page}/?s={query}"
-    _scene_url_re = re.compile(
-        r'href="(?P<url>https://fpo\.xxx/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
-        re.IGNORECASE,
-    )
+
+    def _listing_url(self, page: int) -> str:
+        return f"{_BASE}/new-{page}/"
+
+    # crawl_page is overridden → the abstract hooks below are unused, but required to instantiate.
+    def _extract_scene_urls(self, listing_html: str) -> list[str]:
+        return [m.group("url") for m in _A_RE.finditer(listing_html)]
+
+    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
+        return None
+
+    def crawl_page(self, page: int) -> list[RawScene] | None:
+        url = self._listing_url(page)
+        try:
+            res = browser_get(url, timeout=self._timeout)
+            text = res.text if hasattr(res, "text") else res
+        except Exception as e:
+            log.warning("fpoxxx browse listing fetch failed (page %d): %s", page, e)
+            return None
+
+        out: list[RawScene] = []
+        seen: set[str] = set()
+        anchors = list(_A_RE.finditer(text))
+        for idx, m in enumerate(anchors):
+            scene_url = m.group("url").replace("://www.", "://").rstrip("/") + "/"
+            if scene_url in seen:
+                continue
+            seen.add(scene_url)
+            title = html.unescape(m.group("title") or "").strip()
+            if not title:
+                continue
+            win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 900
+            window = text[m.start():win_end]
+
+            tm = _THUMB_RE.search(window)
+            thumb = tm.group(1) if tm else None
+            dm = _DUR_RE.search(window)
+            duration_sec = _parse_duration(dm.group(1) if dm else None)
+
+            fingerprints: list[RawFingerprint] = []
+            if thumb:
+                ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
+                if ph:
+                    fingerprints.append(RawFingerprint(kind="phash", value=ph))
+
+            out.append(
+                RawScene(
+                    external_id=f"{self.sitetag}:{scene_url}",
+                    title=title,
+                    duration_sec=duration_sec,
+                    url=scene_url,
+                    performers=[],
+                    tags=[],
+                    fingerprints=fingerprints,
+                    playback_sources=[
+                        RawPlaybackSource(
+                            origin=f"tube:{self.sitetag}",
+                            page_url=scene_url,
+                            duration_sec=duration_sec,
+                            thumbnail_url=thumb,
+                        )
+                    ],
+                )
+            )
+
+        log.info("fpoxxx browse page %d: %d scenes", page, len(out))
+        return out