feat(ingest): revive fpoxxx — search→browse (KVS /new-N/)
fpo.xxx is a KVS site, not WordPress, so the old `?s=` search scraper matched nothing (frozen since 2026-05-07). Converted to a browse scraper reading /new-<n>/ (title + duration + thumbnail + phash from the listing tile; performers via canonical merge). Playback was already phone-side (KVS). 32 fresh scenes on first crawl. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
90e391e255
commit
2f3e57c0ac
2 changed files with 123 additions and 13 deletions
|
|
@ -121,7 +121,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# Special
|
# Special
|
||||||
SxyPrnScraper,
|
SxyPrnScraper,
|
||||||
PerverzijaScraper,
|
PerverzijaScraper,
|
||||||
FpoxxxScraper,
|
# FpoxxxScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
|
||||||
|
# user request). fpo.xxx to KVS, nie WordPress → search `?s=` zwracał 0; browse z
|
||||||
|
# `/new-<n>/` daje listing tile (tytuł/thumb/duration). Playback i tak phone-side (KVS).
|
||||||
]
|
]
|
||||||
|
|
||||||
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
||||||
|
|
@ -152,6 +154,7 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
||||||
|
|
||||||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
FreshpornoScraper,
|
FreshpornoScraper,
|
||||||
|
FpoxxxScraper,
|
||||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||||
# nie brał feedu "latest"). Listing card: tytuł (z embedded "<Studio> YY MM DD"),
|
# nie brał feedu "latest"). Listing card: tytuł (z embedded "<Studio> YY MM DD"),
|
||||||
# thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma
|
# thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma
|
||||||
|
|
|
||||||
|
|
@ -1,22 +1,129 @@
|
||||||
"""fpoxxx — direct HTML scrape search results.
|
"""fpo.xxx — latest-vids browse scraper (KVS engine).
|
||||||
|
|
||||||
UWAGA: dokładna domena fpoxxx (sitetag w bazie) niekoniecznie zawiera "com" ani
|
Historia: dawniej WordPress-search scraper (`?s=`), ale fpo.xxx to KVS, nie WP —
|
||||||
"net" — porn-app DEFAULT_SITETAGS używa "fpoxxx" jako sitetag. Best-guess: fpo.xxx.
|
search zwracał 0 (regex slug-URL nie pasował do `/video/<id>/`). Przerobione na
|
||||||
|
BROWSE (latest z `/new-<n>/`), 2026-06-22 (user request: ożywić zamrożone tuby).
|
||||||
|
|
||||||
Search: `https://fpo.xxx/page/<n>/?s=<q>` (WordPress).
|
Listing tile (`/new-<n>/`):
|
||||||
Scene URL: `https://fpo.xxx/<slug>/`.
|
<a href="https://www.fpo.xxx/video/<id>/<slug>/" title="<Tytuł>">
|
||||||
|
<img data-original="...screenshots/.../320x180/1.jpg"> → thumb
|
||||||
|
<span class="duration">1:59:10</span> → duration
|
||||||
|
→ tytuł, miniatura, duration, URL sceny. Performerów/tagów listing nie ma czysto
|
||||||
|
(tytuł bywa JAV-code "Imai Kaho-RKI-602 ..."), więc puste → dorabia canonical-merge.
|
||||||
|
|
||||||
|
Playback: KVS (kt_player + license_code na detail page) — token IP-bound, resolve
|
||||||
|
PO STRONIE TELEFONU (fpoxxxResolver.ts / WebView fallback, extractor `fpoxxx`).
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://www.fpo.xxx"
|
||||||
|
# Tile: <a href="...fpo.xxx/video/<id>/<slug>/" title="<title>">. The remaining fields (thumbnail, duration) are looked up in the text window that follows this anchor.
|
||||||
|
_A_RE = re.compile(
|
||||||
|
r'<a\s+href="(?P<url>https?://(?:www\.)?fpo\.xxx/video/\d+/[^"]*)"\s+title="(?P<title>[^"]*)"',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_THUMB_RE = re.compile(r'data-original="([^"]+)"', re.IGNORECASE)
|
||||||
|
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
|
||||||
|
|
||||||
|
|
||||||
class FpoxxxScraper(BaseSearchScraper):
|
def _parse_duration(text: str | None) -> int | None:
|
||||||
|
"""`1:59:10`→7150 (H:MM:SS); `40:27`→2427 (MM:SS). None gdy brak."""
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
nums = [int(p.strip()) for p in text.split(":")]
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
if len(nums) == 2:
|
||||||
|
return nums[0] * 60 + nums[1]
|
||||||
|
if len(nums) == 3:
|
||||||
|
return nums[0] * 3600 + nums[1] * 60 + nums[2]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class FpoxxxScraper(BaseBrowseScraper):
    """Browse scraper for the fpo.xxx "latest" listing (`/new-<n>/`).

    Each scene is assembled from its listing tile only: scene URL and title
    from the anchor matched by `_A_RE`, thumbnail from `_THUMB_RE`, duration
    from `_DUR_RE`.  Performers and tags are always emitted empty.  The
    playback source carries only the scene page URL (plus duration and
    thumbnail); no stream URL is resolved here.
    """

    sitetag = "fpoxxx"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for page number *page*."""
        return f"{_BASE}/new-{page}/"

    # crawl_page is overridden below, so the two hooks that follow are not
    # used by this class; per the original author's note they still have to
    # be defined for the class to be instantiable.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every scene URL matched by `_A_RE`, in document order."""
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Unused hook: detail pages are not parsed, so always return None."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and build a RawScene per unique tile.

        Returns:
            The scenes found on the page (possibly an empty list), or None
            when the listing could not be fetched or its body is not text.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            # browser_get may hand back a response-like object or the body
            # itself; accept both.
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("fpoxxx browse listing fetch failed (page %d): %s", page, e)
            return None

        # Anything other than str would make the regex scan below raise
        # TypeError; treat it like a failed fetch instead of crashing.
        if not isinstance(text, str):
            log.warning(
                "fpoxxx browse listing fetch failed (page %d): unexpected body type %s",
                page,
                type(text).__name__,
            )
            return None

        out: list[RawScene] = []
        seen: set[str] = set()
        anchors = list(_A_RE.finditer(text))
        for idx, m in enumerate(anchors):
            # Check the title BEFORE claiming the URL: an untitled anchor
            # must not hide a later, titled anchor for the same scene.
            title = html.unescape(m.group("title") or "").strip()
            if not title:
                continue

            # Canonical form: no "www." host prefix, exactly one trailing slash.
            scene_url = m.group("url").replace("://www.", "://").rstrip("/") + "/"
            if scene_url in seen:
                continue
            seen.add(scene_url)

            # The tile's remaining fields are searched between this anchor
            # and the next one; the last anchor gets a fixed 900-char window.
            if idx + 1 < len(anchors):
                win_end = anchors[idx + 1].start()
            else:
                win_end = m.end() + 900
            window = text[m.start():win_end]

            tm = _THUMB_RE.search(window)
            # Attribute values are HTML-escaped (e.g. "&amp;" in a query
            # string); decode so the URL is usable as-is.
            thumb = html.unescape(tm.group(1)) if tm else None
            dm = _DUR_RE.search(window)
            duration_sec = _parse_duration(dm.group(1) if dm else None)

            fingerprints: list[RawFingerprint] = []
            if thumb:
                # NOTE(review): presumably downloads the thumbnail and
                # returns a falsy value on failure — confirm in _browse_base.
                ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
                if ph:
                    fingerprints.append(RawFingerprint(kind="phash", value=ph))

            out.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    duration_sec=duration_sec,
                    url=scene_url,
                    performers=[],
                    tags=[],
                    fingerprints=fingerprints,
                    playback_sources=[
                        RawPlaybackSource(
                            origin=f"tube:{self.sitetag}",
                            page_url=scene_url,
                            duration_sec=duration_sec,
                            thumbnail_url=thumb,
                        )
                    ],
                )
            )

        log.info("fpoxxx browse page %d: %d scenes", page, len(out))
        return out
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue