goon/app/connectors/direct_scrapers/pornxp.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

304 lines
13 KiB
Python

"""pornxp.ph — latest-vids browse scraper.
URL patterns:
- Listing: `https://pornxp.ph/` (page 1, 72 cards) or `?p=N` (pagination).
URLs in the listing carry a randomized suffix per request (`/videos/94528971225` vs
`/videos/94528971837`) — **`data-id` (e.g. `94528971`) is stable**, so that is what
we use for external_id instead of the whole URL.
- Detail: `/videos/<id_with_suffix>`.
- Tags: `/tags/<URL-encoded-name>`. Three categories are inferred heuristically
by `_classify_tag` (studio vs performer vs tag).
Rich signals (well suited to canonical match scoring):
- Title (`<div class="item_title">` in the listing card + `<h1>` on the detail page)
- Studio (from `<div class="tags">`: the first tag containing `.com`/`.co` OR a CamelCase concatenation)
- Performers (from the tags in `<div class="tags">`, Capital + space + Capital)
- Release year (regex `Released:` on the detail page body text)
- Duration (`<div class="item_dur">MM:SS</div>` in the listing card)
- Direct mp4 streams (`<source src="https://sv.porn-xp.com/.../720.mp4">`) — no hoster
- Animated preview (`data-preview="//t.porn-xp.com/.../<id>.mp4"`)
Thumbnail: `<img class="item_img" src="/<id>.jpg">` — relative, pornxp's own CDN.
The phash hit rate is low, but studio + performer + title fuzzy matching is enough for canonical resolution.
"""
from __future__ import annotations
import logging
import re
from datetime import date
from urllib.parse import unquote, urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
# Module-level logger, site root and the precompiled patterns used by the scraper.
log = logging.getLogger(__name__)
# Site root; relative listing/thumbnail URLs are resolved against it.
_BASE = "https://pornxp.ph"
# Listing card — DOTALL because the card HTML spans multiple lines.
# Variant 1 (eager): `<img class="item_img" src="/<id>.jpg">`
# Variant 2 (lazy): `<img class="item_img lazy" src="/images/fluid_spinner.svg" data-src="/<id>.jpg">`
# Both variants are captured through the `img_attrs` group; `_extract_scene_urls`
# prefers `data-src` over `src` when it picks the thumbnail.
# Named groups: id (data-id), preview (optional data-preview), url (/videos/<digits>),
# img_attrs (raw attribute text of the <img>), dur (duration text), title.
_LISTING_CARD_RE = re.compile(
r'<div class="item preview"\s+data-id="(?P<id>\d+)"'
r'(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>'
r'\s*<a href="(?P<url>/videos/\d+)"[^>]*>'
r'.*?<img class="item_img(?:\s+[\w\-]+)*"\s+(?P<img_attrs>[^>]+)>'
r'.*?<div class="item_dur">(?P<dur>[^<]+)</div>'
r'.*?<div class="item_title">(?P<title>[^<]+)</div>',
re.IGNORECASE | re.DOTALL,
)
# Attribute extractors applied to the `img_attrs` text. Note that `\bsrc=` also
# matches inside `data-src=` (the hyphen creates a word boundary), so callers
# must test `_IMG_DATASRC_RE` first.
_IMG_SRC_RE = re.compile(r'\bsrc="([^"]+)"', re.IGNORECASE)
_IMG_DATASRC_RE = re.compile(r'\bdata-src="([^"]+)"', re.IGNORECASE)
# Detail page — tags wrapper. Sometimes <div class="tags">, sometimes inline.
# We match up to the nearest </div> because this scene's tags live in a single div.
_DETAIL_TAGS_BLOCK_RE = re.compile(
r'<div class="tags">(?P<inner>.*?)</div>', re.IGNORECASE | re.DOTALL,
)
# One tag anchor: group 1 is the (URL-encoded) tag path part, group 2 the anchor text.
_TAG_LINK_RE = re.compile(
r'<a\s+href="/tags/([^"]+)"[^>]*>([^<]+)</a>', re.IGNORECASE,
)
# `Released: <4 digits>` anywhere in the detail HTML; group 1 is the year.
_RELEASED_RE = re.compile(r'Released:\s*(\d{4})', re.IGNORECASE)
# First <h1> whose content is plain text (no nested tags); group 1 is the text.
_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.IGNORECASE)
# Direct mp4/m3u8 sources — we prefer 720 over 360. The format is often protocol-relative:
# `<source src="//sv.porn-xp.com/.../720.mp4">` — normalized to `https://...` in the consumer.
_SOURCE_RE = re.compile(
r'<source\s+src="(?P<url>(?:https?:)?//[^"]+\.(?:mp4|m3u8))"',
re.IGNORECASE,
)
def _parse_mmss(s: str) -> int | None:
"""`16:12` → 972, `1:20:37` → 4837. None gdy format niepoprawny."""
parts = s.strip().split(":")
try:
if len(parts) == 2:
return int(parts[0]) * 60 + int(parts[1])
if len(parts) == 3:
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
except ValueError:
return None
return None
def _classify_tag(name: str) -> str:
"""Zwraca 'studio' | 'performer' | 'tag'.
Heurystyka oparta na sample analysis pornxp.ph tagów:
- Studio: zawiera `.` (`TheTeenBay.co`, `Clips4sale.tv`) LUB CamelCase concat
bez spacji (`LegalPorno`, `DirtyWivesClub`, `AnalMom`, `Clips4sale`)
- Performer: dokładnie 2 słowa Capital + Capital (`Alix Lynx`, `Reagan Foxx`)
- Tag/category: pozostałe — lowercase single word LUB Cap single word
(`oral`, `Lesbians`, `Incest`, `BBC`)
Edge case: single-word studio jak "Brazzers", "Vixen" → klasyfikowane jako tag.
To akceptowalne — composite score scoring tags ma niższą wagę niż studio match,
więc fallback z 1+ performer match wystarczy.
"""
name = name.strip()
if not name:
return "tag"
if "." in name:
return "studio"
if " " in name:
parts = name.split()
if len(parts) == 2 and all(p[:1].isupper() for p in parts if p):
return "performer"
return "tag"
# No spaces:
# ALL-uppercase (BBC, POV, BDSM, MILF) → tag (skróty/akronimy)
if name.isupper():
return "tag"
# CamelCase mix (LegalPorno, AnalMom, DirtyWivesClub) → studio
if any(c.isupper() for c in name[1:]):
return "studio"
return "tag"
def _slugify(name: str) -> str:
"""`Alix Lynx` → `alix-lynx`. Lowercase, spaces→hyphens, alphanum only."""
return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
class PornXPScraper(BaseBrowseScraper):
    """Browse scraper for the pornxp.ph "latest videos" listing.

    Provides the three hooks `_listing_url`, `_extract_scene_urls` and
    `_parse_detail` (presumably driven by `BaseBrowseScraper`, which is not
    visible in this file). Listing cards carry duration/thumbnail/title, so
    they are cached per scene URL while a listing page is parsed and merged
    with the detail page later.
    """

    sitetag = "pornxpph"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata keyed by absolute scene URL — populated in
        # `_extract_scene_urls`, consumed in `_parse_detail`. Per the original
        # author's note the detail page has no `<div class="item_dur">` and no
        # thumbnail URL, only h1 + tags + sources. The cache is reset for every
        # listing page (each `_extract_scene_urls` call replaces it).
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for a 1-based page number.

        Page 1 (and anything below) is the homepage; later pages use `?p=N`
        (checked 2026-05-17 with Chrome devtools, per the original note).
        """
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/?p={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs found on a listing page, in document order.

        Side effect: replaces `self._listing_cache` with one entry per returned
        URL holding `data_id`, `preview_mp4`, `thumb`, `duration_sec` and
        `title` taken from the listing card.

        Args:
            listing_html: raw HTML of one listing page.

        Returns:
            Absolute scene URLs, de-duplicated (first occurrence wins).
        """
        self._listing_cache = {}
        seen: set[str] = set()
        out: list[str] = []
        for card in _LISTING_CARD_RE.finditer(listing_html):
            url = urljoin(_BASE, card.group("url"))
            if url in seen:
                continue
            seen.add(url)

            # Prefer data-src (the real URL of a lazy-loaded image) over src
            # (a spinner placeholder in the lazy variant). Eager cards only
            # have src.
            img_attrs = card.group("img_attrs") or ""
            thumb: str | None = None
            if (data_src := _IMG_DATASRC_RE.search(img_attrs)):
                thumb = data_src.group(1)
            elif (plain_src := _IMG_SRC_RE.search(img_attrs)):
                src = plain_src.group(1)
                # Skip the placeholder spinner when there is no data-src.
                if "spinner" not in src.lower():
                    thumb = src
            if thumb and not thumb.startswith("http"):
                thumb = urljoin(_BASE, thumb)

            # data-preview is optional and usually protocol-relative.
            preview = card.group("preview")
            if preview and preview.startswith("//"):
                preview = "https:" + preview

            self._listing_cache[url] = {
                "data_id": card.group("id"),
                "preview_mp4": preview,
                "thumb": thumb,
                "duration_sec": _parse_mmss(card.group("dur") or ""),
                "title": card.group("title").strip(),
            }
            out.append(url)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a detail page plus the cached listing card.

        Args:
            scene_url: URL the detail page was fetched from; also the key into
                `self._listing_cache`.
            detail_html: raw HTML of the detail page.

        Returns:
            The assembled `RawScene`, or None when neither the detail `<h1>`
            nor the listing card yields a title.
        """
        # Listing-card meta (preferred — the detail page has no duration/thumb).
        meta = self._listing_cache.get(scene_url, {})
        data_id = meta.get("data_id")
        if not data_id:
            # The URL is not in the current listing cache (e.g. random-suffix
            # mismatch after a pagination redo): recover an id from the URL.
            # NOTE(review): the module docstring's examples show URL ids as the
            # data-id followed by a per-request suffix (`94528971225` for
            # data-id `94528971`). This greedy pattern captures that suffix
            # too, so the id recovered here may differ from the stable
            # data-id — confirm the suffix length before trimming it.
            id_match = re.search(r"/videos/(\d{6,12})", scene_url)
            data_id = id_match.group(1) if id_match else None

        # Title: prefer the detail <h1> over the listing-card title (cleaner).
        title = meta.get("title") or ""
        if (h1 := _H1_RE.search(detail_html)):
            title = h1.group(1).strip() or title
        if not title:
            return None

        duration_sec = meta.get("duration_sec")
        thumb = meta.get("thumb")

        # Release year — `Released: 2016`. RawScene takes a `release_date`
        # (a `date`), not a bare year, so Jan 1 is stored as a placeholder to
        # give the resolver a year signal. The pattern guarantees four digits,
        # so int() cannot fail; the range check keeps `date()` valid.
        release_date: date | None = None
        if (released := _RELEASED_RE.search(detail_html)):
            year = int(released.group(1))
            if 1970 <= year <= 2100:
                release_date = date(year, 1, 1)

        # Tags: only the first <div class="tags">...</div> block, i.e. this
        # scene's tags rather than those of related videos.
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf_slugs: set[str] = set()
        seen_tag_slugs: set[str] = set()
        if (block := _DETAIL_TAGS_BLOCK_RE.search(detail_html)):
            for tag_m in _TAG_LINK_RE.finditer(block.group("inner")):
                url_part = tag_m.group(1)
                name = tag_m.group(2).strip()
                # URL-encoded space → real space; some tags contain `%20`.
                decoded_name = unquote(url_part).strip()
                # The anchor text is preferred as display name (it sometimes
                # differs from the URL slug); the decoded URL part is the
                # fallback when the anchor text is blank.
                display = name or decoded_name
                kind = _classify_tag(display)
                slug = _slugify(display)
                if not slug:
                    continue
                ext_id = f"{self.sitetag}:{kind}:{slug}"
                if kind == "studio":
                    if studio is None:  # the first studio-like tag wins
                        studio = RawStudio(external_id=ext_id, name=display, slug=slug)
                elif kind == "performer":
                    if slug not in seen_perf_slugs:
                        seen_perf_slugs.add(slug)
                        performers.append(RawPerformer(external_id=ext_id, name=display))
                else:
                    if slug not in seen_tag_slugs:
                        seen_tag_slugs.add(slug)
                        tags.append(RawTag(external_id=ext_id, name=display, slug=slug))

        # Playback: direct streams `<source src="//sv.porn-xp.com/.../720.mp4">`.
        # URLs are protocol-relative — normalize to `https:`.
        all_sources = [
            "https:" + src if src.startswith("//") else src
            for src in (sm.group("url") for sm in _SOURCE_RE.finditer(detail_html))
        ]
        # Prefer the 720 rendition, otherwise take the first source. The marker
        # must not be part of a longer digit run: a plain `"720" in url` test
        # also fires on numeric path components that merely contain 720 (such
        # as a video id), which would select a lower-quality source.
        stream_url: str | None = None
        if all_sources:
            stream_url = next(
                (u for u in all_sources if re.search(r"(?<!\d)720p?(?!\d)", u)),
                all_sources[0],
            )

        # Phash of the thumbnail (pornxp's own CDN — a low match rate is
        # expected, but it is worth trying). Canonical resolution is expected
        # to rely mainly on studio + performer + year + title scoring.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Normalize page_url: the homepage serves a random URL suffix per
        # request (`/videos/94528971225` vs `/videos/94528971836` for the same
        # scene). The PlaybackSource unique key is `(origin, page_url)`, so
        # without normalization every scrape run would create duplicates.
        # Canonical URL = `/videos/<data_id>`.
        canonical_url = f"{_BASE}/videos/{data_id}" if data_id else scene_url
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=canonical_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{data_id}" if data_id else f"{self.sitetag}:{scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )