"""fpo.xxx — latest-vids browse scraper (KVS engine). Historia: dawniej WordPress-search scraper (`?s=`), ale fpo.xxx to KVS, nie WP — search zwracał 0 (regex slug-URL nie pasował do `/video//`). Przerobione na BROWSE (latest z `/new-/`), 2026-06-22 (user request: ożywić zamrożone tuby). Listing tile (`/new-/`): → thumb 1:59:10 → duration → tytuł, miniatura, duration, URL sceny. Performerów/tagów listing nie ma czysto (tytuł bywa JAV-code "Imai Kaho-RKI-602 ..."), więc puste → dorabia canonical-merge. Playback: KVS (kt_player + license_code na detail page) — token IP-bound, resolve PO STRONIE TELEFONU (fpoxxxResolver.ts / WebView fallback, extractor `fpoxxx`). """ from __future__ import annotations import html import logging import re from app.connectors.base import ( RawFingerprint, RawPlaybackSource, RawScene, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, ) from app.extractors import browser_get log = logging.getLogger(__name__) _BASE = "https://www.fpo.xxx" # Kafelek: . Reszta pól w oknie. _A_RE = re.compile( r'https?://(?:www\.)?fpo\.xxx/video/\d+/[^"]*)"\s+title="(?P[^"]*)"', re.IGNORECASE, ) _THUMB_RE = re.compile(r'data-original="([^"]+)"', re.IGNORECASE) _DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<') def _parse_duration(text: str | None) -> int | None: """`1:59:10`→7150 (H:MM:SS); `40:27`→2427 (MM:SS). None gdy brak.""" if not text: return None try: nums = [int(p.strip()) for p in text.split(":")] except ValueError: return None if len(nums) == 2: return nums[0] * 60 + nums[1] if len(nums) == 3: return nums[0] * 3600 + nums[1] * 60 + nums[2] return None class FpoxxxScraper(BaseBrowseScraper): sitetag = "fpoxxx" def _listing_url(self, page: int) -> str: return f"{_BASE}/new-{page}/" # crawl_page nadpisany → abstrakcje nieużywane, ale wymagane do instancji. def _extract_scene_urls(self, listing_html: str) -> list[str]: return [m.group("url") for m in _A_RE.finditer(listing_html)] def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: return None def crawl_page(self, page: int) -> list[RawScene] | None: url = self._listing_url(page) try: res = browser_get(url, timeout=self._timeout) text = res.text if hasattr(res, "text") else res except Exception as e: log.warning("fpoxxx browse listing fetch failed (page %d): %s", page, e) return None out: list[RawScene] = [] seen: set[str] = set() anchors = list(_A_RE.finditer(text)) for idx, m in enumerate(anchors): scene_url = m.group("url").replace("://www.", "://").rstrip("/") + "/" if scene_url in seen: continue seen.add(scene_url) title = html.unescape(m.group("title") or "").strip() if not title: continue win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 900 window = text[m.start():win_end] tm = _THUMB_RE.search(window) thumb = tm.group(1) if tm else None dm = _DUR_RE.search(window) duration_sec = _parse_duration(dm.group(1) if dm else None) fingerprints: list[RawFingerprint] = [] if thumb: ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) out.append( RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, duration_sec=duration_sec, url=scene_url, performers=[], tags=[], fingerprints=fingerprints, playback_sources=[ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumb, ) ], ) ) log.info("fpoxxx browse page %d: %d scenes", page, len(out)) return out