"""pornxp.ph — latest-vids browse scraper. URL patterns: - Listing: `https://pornxp.ph/` (page 1, 72 cards) lub `?p=N` (pagination). URL-e w listing mają randomized suffix per request (`/videos/94528971225` vs `/videos/94528971837`) — **`data-id` (np. `94528971`) jest stable** i tego używamy dla external_id zamiast całego URL. - Detail: `/videos/`. - Tags: `/tags/`. Trzy kategorie wnioskowane heurystyką z `_classify_tag` (studio vs performer vs tag). Rich signals (perfekt dla canonical match scoring): - Title (`
` w listing card + `

` na detail) - Studio (z `
` pierwszy tag z `.com`/`.co` LUB CamelCase concat) - Performers (z tags w `
`, Capital + space + Capital) - Release year (regex `Released:` na detail page bodyText) - Duration (`
MM:SS
` listing card) - Direct mp4 streams (``) — no hoster - Animated preview (`data-preview="//t.porn-xp.com/.../.mp4"`) Thumbnail: `` — relatywny, pornxp's own CDN. Phash hit-rate niskie ale studio+performer+title fuzzy match wystarczy do canonical. """ from __future__ import annotations import logging import re from datetime import date from urllib.parse import unquote, urljoin from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, ) log = logging.getLogger(__name__) _BASE = "https://pornxp.ph" # Listing card — DOTALL bo HTML cards są wieloliniowe. # Wariant 1 (eager): `` # Wariant 2 (lazy): `` # Łapiemy obie warianty — w `_parse_listing_thumb` preferujemy `data-src` nad `src`. _LISTING_CARD_RE = re.compile( r'
]*>' r'\s*]*>' r'.*?[^>]+)>' r'.*?
(?P[^<]+)
' r'.*?
(?P[^<]+)</div>', re.IGNORECASE | re.DOTALL, ) _IMG_SRC_RE = re.compile(r'\bsrc="([^"]+)"', re.IGNORECASE) _IMG_DATASRC_RE = re.compile(r'\bdata-src="([^"]+)"', re.IGNORECASE) # Detail page — tags wrapper. Sometimes <div class="tags">, sometimes inline. # Bierzemy do najbliższego </div> bo tagi tej sceny są w jednym divie. _DETAIL_TAGS_BLOCK_RE = re.compile( r'<div class="tags">(?P<inner>.*?)</div>', re.IGNORECASE | re.DOTALL, ) _TAG_LINK_RE = re.compile( r'<a\s+href="/tags/([^"]+)"[^>]*>([^<]+)</a>', re.IGNORECASE, ) _RELEASED_RE = re.compile(r'Released:\s*(\d{4})', re.IGNORECASE) _H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.IGNORECASE) # Direct mp4/m3u8 sources — preferujemy 720 nad 360. Format często protocol-relative: # `<source src="//sv.porn-xp.com/.../720.mp4">` — normalize do `https://...` w consumerze. _SOURCE_RE = re.compile( r'<source\s+src="(?P<url>(?:https?:)?//[^"]+\.(?:mp4|m3u8))"', re.IGNORECASE, ) def _parse_mmss(s: str) -> int | None: """`16:12` → 972, `1:20:37` → 4837. None gdy format niepoprawny.""" parts = s.strip().split(":") try: if len(parts) == 2: return int(parts[0]) * 60 + int(parts[1]) if len(parts) == 3: return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) except ValueError: return None return None def _classify_tag(name: str) -> str: """Zwraca 'studio' | 'performer' | 'tag'. Heurystyka oparta na sample analysis pornxp.ph tagów: - Studio: zawiera `.` (`TheTeenBay.co`, `Clips4sale.tv`) LUB CamelCase concat bez spacji (`LegalPorno`, `DirtyWivesClub`, `AnalMom`, `Clips4sale`) - Performer: dokładnie 2 słowa Capital + Capital (`Alix Lynx`, `Reagan Foxx`) - Tag/category: pozostałe — lowercase single word LUB Cap single word (`oral`, `Lesbians`, `Incest`, `BBC`) Edge case: single-word studio jak "Brazzers", "Vixen" → klasyfikowane jako tag. To akceptowalne — composite score scoring tags ma niższą wagę niż studio match, więc fallback z 1+ performer match wystarczy. """ name = name.strip() if not name: return "tag" if "." in name: return "studio" if " " in name: parts = name.split() if len(parts) == 2 and all(p[:1].isupper() for p in parts if p): return "performer" return "tag" # No spaces: # ALL-uppercase (BBC, POV, BDSM, MILF) → tag (skróty/akronimy) if name.isupper(): return "tag" # CamelCase mix (LegalPorno, AnalMom, DirtyWivesClub) → studio if any(c.isupper() for c in name[1:]): return "studio" return "tag" def _slugify(name: str) -> str: """`Alix Lynx` → `alix-lynx`. Lowercase, spaces→hyphens, alphanum only.""" return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") class PornXPScraper(BaseBrowseScraper): sitetag = "pornxpph" def __init__(self) -> None: super().__init__() # Cache listing card metadata per scene URL — populated w `_extract_scene_urls`, # consumed w `_parse_detail`. Detail page sam nie ma `<div class="item_dur">` # ani thumbnail URL, tylko h1+tags+sources. Cache reset per page (każde # _extract_scene_urls override'uje). self._listing_cache: dict[str, dict] = {} def _listing_url(self, page: int) -> str: # Page 1 = homepage. Pagination `?p=N` (sprawdzone 2026-05-17 chrome devtools). if page <= 1: return f"{_BASE}/" return f"{_BASE}/?p={page}" def _extract_scene_urls(self, listing_html: str) -> list[str]: """Zwraca listę URL-i scen + cache'uje meta z listing card (duration, thumb, title, data-id) w `self._listing_cache[url]`.""" self._listing_cache = {} seen: set[str] = set() out: list[str] = [] for m in _LISTING_CARD_RE.finditer(listing_html): rel_url = m.group("url") url = urljoin(_BASE, rel_url) if url in seen: continue seen.add(url) # Parse img_attrs: prefer data-src (lazy-load actual URL) nad src # (placeholder spinner.svg dla lazy variant). Eager cards mają tylko src. img_attrs = m.group("img_attrs") or "" thumb = None if (dm := _IMG_DATASRC_RE.search(img_attrs)): thumb = dm.group(1) elif (sm := _IMG_SRC_RE.search(img_attrs)): src = sm.group(1) # Skipnij placeholder spinner jeśli nie ma data-src. if "spinner" not in src.lower(): thumb = src if thumb and not thumb.startswith("http"): thumb = urljoin(_BASE, thumb) self._listing_cache[url] = { "data_id": m.group("id"), "preview_mp4": ( "https:" + m.group("preview") if m.group("preview") and m.group("preview").startswith("//") else m.group("preview") ), "thumb": thumb, "duration_sec": _parse_mmss(m.group("dur") or ""), "title": m.group("title").strip(), } out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: # Listing-card meta (preferowane — detail page nie ma duration/thumb) meta = self._listing_cache.get(scene_url, {}) data_id = meta.get("data_id") if not data_id: # URL nie pasuje do listingu (random suffix mismatch po pagination redo). # Wyciągnij data-id z URL: /videos/<id>... — pierwsze 8-10 cyfr. id_match = re.search(r"/videos/(\d{6,12})", scene_url) data_id = id_match.group(1) if id_match else None # Title: prefer h1 over listing card title (detail h1 jest cleaner) title = meta.get("title") or "" if (m := _H1_RE.search(detail_html)): title = m.group(1).strip() or title if not title: return None duration_sec = meta.get("duration_sec") thumb = meta.get("thumb") # Release year — `Released: 2016`. RawScene ma `release_date` (typu `date`), # nie samo year — wpisujemy Jan 1 jako placeholder żeby resolver miał year # signal (date proximity scoring tylko sprawdza year w composite). release_date: date | None = None if (m := _RELEASED_RE.search(detail_html)): try: year = int(m.group(1)) if 1970 <= year <= 2100: release_date = date(year, 1, 1) except ValueError: pass # Tags: tylko block <div class="tags">...</div> tej sceny (nie related). studio: RawStudio | None = None performers: list[RawPerformer] = [] tags: list[RawTag] = [] seen_perf_slugs: set[str] = set() seen_tag_slugs: set[str] = set() if (block := _DETAIL_TAGS_BLOCK_RE.search(detail_html)): for tag_m in _TAG_LINK_RE.finditer(block.group("inner")): url_part = tag_m.group(1) name = tag_m.group(2).strip() # URL-encoded space → real space. Niektóre tagi mają `%20`. decoded_name = unquote(url_part).strip() # Display name z anchor preferowane (czasem rożni się od URL slug). display = name or decoded_name kind = _classify_tag(display) slug = _slugify(display) if not slug: continue ext_id = f"{self.sitetag}:{kind}:{slug}" if kind == "studio": if studio is None: # pierwszy studio-tag wygrywa studio = RawStudio(external_id=ext_id, name=display, slug=slug) elif kind == "performer": if slug not in seen_perf_slugs: seen_perf_slugs.add(slug) performers.append(RawPerformer(external_id=ext_id, name=display)) else: if slug not in seen_tag_slugs: seen_tag_slugs.add(slug) tags.append(RawTag(external_id=ext_id, name=display, slug=slug)) # Playback: direct mp4 streams `<source src="//sv.porn-xp.com/.../720.mp4">`. # URL-e są protocol-relative — normalize do `https:`. Preferujemy 720 nad 360. def _norm(u: str) -> str: return "https:" + u if u.startswith("//") else u stream_url: str | None = None all_sources = [_norm(m.group("url")) for m in _SOURCE_RE.finditer(detail_html)] if all_sources: for u in all_sources: if "720" in u: stream_url = u break stream_url = stream_url or all_sources[0] # Phash z thumbnail (pornxp własny CDN — expected niski match rate, ale # try). Reseter ścieżek do canonical odbędzie się głównie przez # studio+performer+year+title scoring. fingerprints: list[RawFingerprint] = [] if thumb: ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) # Normalize page_url: pornxp homepage serwuje random URL suffix per request # (`/videos/94528971225` vs `/videos/94528971836` ten sam scene). PlaybackSource # unique key to `(origin, page_url)` — bez normalize generujemy 3x duplikaty # na każdym scrape run. Canonical URL = `/videos/<data_id>`. canonical_url = ( f"{_BASE}/videos/{data_id}" if data_id else scene_url ) playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=canonical_url, duration_sec=duration_sec, thumbnail_url=thumb, stream_url=stream_url, ) ] return RawScene( external_id=f"{self.sitetag}:{data_id}" if data_id else f"{self.sitetag}:{scene_url}", title=title, release_date=release_date, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )