Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
304 lines
13 KiB
Python
304 lines
13 KiB
Python
"""pornxp.ph — latest-vids browse scraper.

URL patterns:
- Listing: `https://pornxp.ph/` (page 1, 72 cards) or `?p=N` (pagination).
  Listing URLs carry a randomized suffix per request (`/videos/94528971225` vs
  `/videos/94528971837`) — **the `data-id` (e.g. `94528971`) is stable**, so it
  is used for external_id instead of the full URL.
- Detail: `/videos/<id_with_suffix>`.
- Tags: `/tags/<URL-encoded-name>`. Three categories are inferred heuristically
  by `_classify_tag` (studio vs performer vs tag).

Rich signals (well suited to canonical match scoring):
- Title (`<div class="item_title">` in the listing card + `<h1>` on the detail page)
- Studio (first tag in `<div class="tags">` containing `.com`/`.co` OR a CamelCase concatenation)
- Performers (tags in `<div class="tags">` shaped as Capital + space + Capital)
- Release year (regex `Released:` over the detail page body text)
- Duration (`<div class="item_dur">MM:SS</div>` in the listing card)
- Direct mp4 streams (`<source src="https://sv.porn-xp.com/.../720.mp4">`) — no hoster
- Animated preview (`data-preview="//t.porn-xp.com/.../<id>.mp4"`)

Thumbnail: `<img class="item_img" src="/<id>.jpg">` — relative URL, pornxp's own CDN.
The phash hit rate is low, but studio + performer + title fuzzy matching is enough
for canonical resolution.
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from datetime import date
|
|
from urllib.parse import unquote, urljoin
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
)
|
|
|
|
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Site root; relative scene and thumbnail URLs are resolved against it.
_BASE = "https://pornxp.ph"

# Listing card — DOTALL because the HTML cards span multiple lines.
# Variant 1 (eager): `<img class="item_img" src="/<id>.jpg">`
# Variant 2 (lazy):  `<img class="item_img lazy" src="/images/fluid_spinner.svg" data-src="/<id>.jpg">`
# Both variants are captured: the whole attribute tail of the <img> tag lands in
# the `img_attrs` group, and `_extract_scene_urls` prefers `data-src` over `src`.
# Named groups: id (data-id), preview (optional data-preview), url (/videos/<digits>),
# img_attrs, dur (item_dur text), title (item_title text).
_LISTING_CARD_RE = re.compile(
    r'<div class="item preview"\s+data-id="(?P<id>\d+)"'
    r'(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>'
    r'\s*<a href="(?P<url>/videos/\d+)"[^>]*>'
    r'.*?<img class="item_img(?:\s+[\w\-]+)*"\s+(?P<img_attrs>[^>]+)>'
    r'.*?<div class="item_dur">(?P<dur>[^<]+)</div>'
    r'.*?<div class="item_title">(?P<title>[^<]+)</div>',
    re.IGNORECASE | re.DOTALL,
)
# Applied to the `img_attrs` capture. NOTE: `\bsrc=` also matches inside
# `data-src=` (word boundary after the hyphen), so callers must test
# `_IMG_DATASRC_RE` first.
_IMG_SRC_RE = re.compile(r'\bsrc="([^"]+)"', re.IGNORECASE)
_IMG_DATASRC_RE = re.compile(r'\bdata-src="([^"]+)"', re.IGNORECASE)

# Detail page — tags wrapper. Sometimes <div class="tags">, sometimes inline.
# The match stops at the nearest </div> because this scene's tags live in a
# single div.
_DETAIL_TAGS_BLOCK_RE = re.compile(
    r'<div class="tags">(?P<inner>.*?)</div>', re.IGNORECASE | re.DOTALL,
)
# One tag anchor: group 1 is the (possibly URL-encoded) path segment after
# `/tags/`, group 2 is the anchor text.
_TAG_LINK_RE = re.compile(
    r'<a\s+href="/tags/([^"]+)"[^>]*>([^<]+)</a>', re.IGNORECASE,
)
# Four-digit year following the literal `Released:` label.
_RELEASED_RE = re.compile(r'Released:\s*(\d{4})', re.IGNORECASE)
# First <h1> whose content is plain text (no nested tags).
_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.IGNORECASE)
# Direct mp4/m3u8 sources — 720 is preferred over 360. The URL is often
# protocol-relative: `<source src="//sv.porn-xp.com/.../720.mp4">` — the consumer
# normalizes it to `https://...`.
_SOURCE_RE = re.compile(
    r'<source\s+src="(?P<url>(?:https?:)?//[^"]+\.(?:mp4|m3u8))"',
    re.IGNORECASE,
)
|
|
|
|
|
|
def _parse_mmss(s: str) -> int | None:
|
|
"""`16:12` → 972, `1:20:37` → 4837. None gdy format niepoprawny."""
|
|
parts = s.strip().split(":")
|
|
try:
|
|
if len(parts) == 2:
|
|
return int(parts[0]) * 60 + int(parts[1])
|
|
if len(parts) == 3:
|
|
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
|
|
except ValueError:
|
|
return None
|
|
return None
|
|
|
|
|
|
def _classify_tag(name: str) -> str:
|
|
"""Zwraca 'studio' | 'performer' | 'tag'.
|
|
|
|
Heurystyka oparta na sample analysis pornxp.ph tagów:
|
|
- Studio: zawiera `.` (`TheTeenBay.co`, `Clips4sale.tv`) LUB CamelCase concat
|
|
bez spacji (`LegalPorno`, `DirtyWivesClub`, `AnalMom`, `Clips4sale`)
|
|
- Performer: dokładnie 2 słowa Capital + Capital (`Alix Lynx`, `Reagan Foxx`)
|
|
- Tag/category: pozostałe — lowercase single word LUB Cap single word
|
|
(`oral`, `Lesbians`, `Incest`, `BBC`)
|
|
|
|
Edge case: single-word studio jak "Brazzers", "Vixen" → klasyfikowane jako tag.
|
|
To akceptowalne — composite score scoring tags ma niższą wagę niż studio match,
|
|
więc fallback z 1+ performer match wystarczy.
|
|
"""
|
|
name = name.strip()
|
|
if not name:
|
|
return "tag"
|
|
if "." in name:
|
|
return "studio"
|
|
if " " in name:
|
|
parts = name.split()
|
|
if len(parts) == 2 and all(p[:1].isupper() for p in parts if p):
|
|
return "performer"
|
|
return "tag"
|
|
# No spaces:
|
|
# ALL-uppercase (BBC, POV, BDSM, MILF) → tag (skróty/akronimy)
|
|
if name.isupper():
|
|
return "tag"
|
|
# CamelCase mix (LegalPorno, AnalMom, DirtyWivesClub) → studio
|
|
if any(c.isupper() for c in name[1:]):
|
|
return "studio"
|
|
return "tag"
|
|
|
|
|
|
def _slugify(name: str) -> str:
|
|
"""`Alix Lynx` → `alix-lynx`. Lowercase, spaces→hyphens, alphanum only."""
|
|
return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
|
|
|
|
|
|
class PornXPScraper(BaseBrowseScraper):
    """Browse scraper for pornxp.ph: listing pages → scene URLs → `RawScene`.

    `_extract_scene_urls` parses listing cards and remembers per-card metadata
    (data-id, preview, thumbnail, duration, title); `_parse_detail` combines that
    cached metadata with the detail page (h1, tags, release year, sources).
    """

    sitetag = "pornxpph"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata keyed by absolute scene URL — populated in
        # `_extract_scene_urls`, consumed in `_parse_detail`. The detail page
        # itself has no `<div class="item_dur">` or thumbnail URL, only
        # h1 + tags + sources. The cache is reset per listing page (every
        # `_extract_scene_urls` call replaces it).
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for 1-based `page`."""
        # Page 1 = homepage. Pagination is `?p=N` (verified 2026-05-17 with
        # Chrome DevTools).
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/?p={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the de-duplicated absolute scene URLs found in `listing_html`.

        Side effect: replaces `self._listing_cache` with one entry per returned
        URL holding `data_id`, `preview_mp4`, `thumb`, `duration_sec` and
        `title` taken from the listing card.
        """
        self._listing_cache = {}
        seen: set[str] = set()
        out: list[str] = []
        for m in _LISTING_CARD_RE.finditer(listing_html):
            url = urljoin(_BASE, m.group("url"))
            if url in seen:
                continue
            seen.add(url)

            # Prefer data-src (the real URL of a lazy-loaded image) over src
            # (a placeholder spinner in the lazy variant). Eager cards only
            # have src.
            img_attrs = m.group("img_attrs") or ""
            thumb = None
            if (dm := _IMG_DATASRC_RE.search(img_attrs)):
                thumb = dm.group(1)
            elif (sm := _IMG_SRC_RE.search(img_attrs)):
                src = sm.group(1)
                # Skip the placeholder spinner when there is no data-src.
                if "spinner" not in src.lower():
                    thumb = src
            if thumb and not thumb.startswith("http"):
                thumb = urljoin(_BASE, thumb)

            # data-preview is optional and may be protocol-relative.
            preview = m.group("preview")
            if preview and preview.startswith("//"):
                preview = "https:" + preview

            self._listing_cache[url] = {
                "data_id": m.group("id"),
                "preview_mp4": preview,
                "thumb": thumb,
                "duration_sec": _parse_mmss(m.group("dur") or ""),
                "title": m.group("title").strip(),
            }
            out.append(url)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a detail page plus cached listing metadata.

        Args:
            scene_url: Absolute scene URL, as returned by `_extract_scene_urls`.
            detail_html: HTML of the scene's detail page.

        Returns:
            The assembled `RawScene`, or None when no title can be determined
            from either the detail `<h1>` or the listing card.
        """
        # Listing-card metadata (preferred — the detail page has no
        # duration/thumbnail).
        meta = self._listing_cache.get(scene_url, {})
        data_id = meta.get("data_id")
        if not data_id:
            # The URL is not in the current listing cache (random suffix
            # mismatch after a pagination redo). Fall back to the digits in
            # the URL path.
            # NOTE(review): this greedy capture also includes the per-request
            # random suffix described in the module docstring, so the fallback
            # id may not equal the stable data-id — confirm whether the suffix
            # should be stripped.
            id_match = re.search(r"/videos/(\d{6,12})", scene_url)
            data_id = id_match.group(1) if id_match else None

        # Title: prefer the detail <h1> over the listing-card title (the h1 is
        # cleaner).
        title = meta.get("title") or ""
        if (m := _H1_RE.search(detail_html)):
            title = m.group(1).strip() or title
        if not title:
            return None

        duration_sec = meta.get("duration_sec")
        thumb = meta.get("thumb")

        # Release year — `Released: 2016`. RawScene takes a `release_date`
        # (a `date`), not a bare year, so Jan 1 is used as a placeholder to give
        # the resolver a year signal (date-proximity scoring only checks the
        # year in the composite score).
        release_date: date | None = None
        if (m := _RELEASED_RE.search(detail_html)):
            # The capture is exactly four digits, so int() cannot fail here.
            year = int(m.group(1))
            if 1970 <= year <= 2100:
                release_date = date(year, 1, 1)

        # Tags: only this scene's <div class="tags">...</div> block (not the
        # related-videos ones).
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf_slugs: set[str] = set()
        seen_tag_slugs: set[str] = set()
        if (block := _DETAIL_TAGS_BLOCK_RE.search(detail_html)):
            for tag_m in _TAG_LINK_RE.finditer(block.group("inner")):
                url_part = tag_m.group(1)
                name = tag_m.group(2).strip()
                # URL-encoded space → real space. Some tags contain `%20`.
                decoded_name = unquote(url_part).strip()
                # The anchor's display text is preferred (it sometimes differs
                # from the URL slug).
                display = name or decoded_name
                kind = _classify_tag(display)
                slug = _slugify(display)
                if not slug:
                    continue
                ext_id = f"{self.sitetag}:{kind}:{slug}"
                if kind == "studio":
                    if studio is None:  # the first studio-like tag wins
                        studio = RawStudio(external_id=ext_id, name=display, slug=slug)
                elif kind == "performer":
                    if slug not in seen_perf_slugs:
                        seen_perf_slugs.add(slug)
                        performers.append(RawPerformer(external_id=ext_id, name=display))
                else:
                    if slug not in seen_tag_slugs:
                        seen_tag_slugs.add(slug)
                        tags.append(RawTag(external_id=ext_id, name=display, slug=slug))

        # Playback: direct streams `<source src="//sv.porn-xp.com/.../720.mp4">`.
        # URLs are protocol-relative — normalize to `https:`.
        def _norm(u: str) -> str:
            return "https:" + u if u.startswith("//") else u

        all_sources = [_norm(m.group("url")) for m in _SOURCE_RE.finditer(detail_html)]
        # Prefer the 720 rendition, otherwise take the first source. "720" must
        # stand alone as a number: a plain substring test would also fire on a
        # video id that merely contains those digits (e.g. `/94720528/360.mp4`).
        stream_url: str | None = next(
            (u for u in all_sources if re.search(r"(?<!\d)720(?!\d)", u)),
            all_sources[0] if all_sources else None,
        )

        # Phash of the thumbnail (pornxp's own CDN — a low match rate is
        # expected, but worth trying). Resolution to canonical scenes happens
        # mainly through studio + performer + year + title scoring.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Normalize page_url: the pornxp homepage serves a random URL suffix per
        # request (`/videos/94528971225` vs `/videos/94528971836` for the same
        # scene). The PlaybackSource unique key is `(origin, page_url)` — without
        # normalization every scrape run would create duplicates. Canonical
        # URL = `/videos/<data_id>`.
        canonical_url = f"{_BASE}/videos/{data_id}" if data_id else scene_url
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=canonical_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{data_id}" if data_id else f"{self.sitetag}:{scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|