"""yesporn.vip — latest-vids browse scraper. Dołączony 2026-05-27. Identyfikowany przez user audit jako "scraper-of-paysites" (DogFart / HardX / TeamSkeet / Vixen / Brazzers content). Wcześniejszy theporndude audit pomylił domeny: `yespornvip.com` (z theporndude rankingu) redirectuje przez pdude.link do `porndudecams.com` affiliate spam — kanoniczna domena ma TLD `.vip`. Czemu wart (parity z porndoe): - **JSON-LD VideoObject** w każdym scene page: name, description, uploadDate (ISO `YYYY-MM-DDTHH:MM:SS`), duration (ISO `PT0H39M00S`), thumbnailUrl (BunnyCDN: `yesnn.b-cdn.net/contents/videos_screenshots/...`). - **``** — durations już w sekundach (fallback gdy ISO-duration parse fail). - **``** — ISO 8601 z timezone, redundant z JSON-LD uploadDate ale czystszy format. - **``** (multiple) — kanoniczna lista tagów (np. "Big Ass", "Threesome"). Główne źródło tagów; alternatywnie DOM ma `btn gold` linki ale te miksują performerów/studio z tagami. - **Studio + Performers**: oba w sekcji `` (studio, singular) i `` (performerzy, multiple). Slugi mają stable per-type salt (`*-i459s7` dla modeli, `*-7p72tp` dla channels) — zachowują się jak hash z site-version, ale stabilne przez sesje. External_id strategia: `yespornvip:` (`/video/69841/...` → `69841`). Slug w URL ma `*-npu57w` suffix który wygląda na stałe-per-page-type, ale id numeryczne jest bezpieczniejsze gdyby site zmienił salt. URL patterns: - Listing: `/latest-updates/` (page 1) / `/latest-updates/N/` (page>1) - Scene: `/video///` (id numeryczny, slug = title slug + 6-char salt) - Studio: `/channels//` - Performer: `/models//` - Search: `/search//` (nie używane w browse-mode — można dorobić jako osobny tryb dla performer-driven backfill jeśli będzie potrzeba) Playback: download endpoint `/view_video_download.php?id=&format=<480|720|1080>` z `data-attach-session="PHPSESSID"` — wymaga session cookie, więc nie direct mp4 z server-side. Plus jest `embedUrl: /embed/` w JSON-LD. 
Extractor → `_vps_blocked_fallback.extract` (zgodne z pre-public bandwidth/anonymity policy): mobile WebView fetcha embed z phone IP, INJECTED_JS scrape'uje ``. # Slug `` zawiera stable per-type salt (`*-npu57w` dla videos). _SCENE_URL_RE = re.compile( r'href="(https://yesporn\.vip/video/(\d+)/[a-z0-9\-]+/)"', re.IGNORECASE, ) _VIDEO_ID_RE = re.compile(r"/video/(\d+)/", re.IGNORECASE) # Studio (singular) i performerzy (multiple) w ``. # Studio: `/channels//`. Performer: `/models//`. Tekst linka = # nazwa wyświetlana (może zawierać CSS-y/inne tagi, więc strip tagów po fakcie). _STUDIO_LINK_RE = re.compile( r']*>(.*?)', re.IGNORECASE | re.DOTALL, ) _PERFORMER_LINK_RE = re.compile( r']*>(.*?)', re.IGNORECASE | re.DOTALL, ) _HTML_TAG_RE = re.compile(r"<[^>]+>") # JSON-LD VideoObject — pełny blok między `', re.IGNORECASE | re.DOTALL, ) # `` — multiple, jeden tag per meta. _META_TAG_RE = re.compile( r' int | None: if not value: return None m = _ISO_DUR_RE.match(value.strip()) if not m: return None h = int(m.group(1) or 0) mn = int(m.group(2) or 0) s = int(m.group(3) or 0) total = h * 3600 + mn * 60 + s return total or None def _parse_iso_date(value: str | None) -> date | None: """`2026-05-26T19:23:29Z` / `2026-05-26T19:23:29.EDT` → date.""" if not value: return None # yesporn emituje `.EDT` jako "timezone" w JSON-LD uploadDate — strip żeby # `fromisoformat` nie crash'ował. video:release_date meta ma czysty `Z`. 
def _parse_iso_date(value: str | None) -> date | None:
    """Parse an ISO-8601-like timestamp into a calendar date.

    Handles e.g. ``2026-05-26T19:23:29Z`` and the site's non-standard
    ``2026-05-26T19:23:29.EDT`` form.

    Args:
        value: Raw timestamp text. Anything that is empty or not a ``str``
            (JSON-LD fields can hold any JSON type) yields ``None``.

    Returns:
        The date part of the timestamp, or ``None`` when nothing parseable
        is found.
    """
    if not value or not isinstance(value, str):
        return None
    # yesporn emits `.EDT` as a "timezone" in the JSON-LD uploadDate; strip it
    # so `fromisoformat` does not reject the value. The video:release_date
    # meta uses a plain `Z`.
    cleaned = re.sub(r"\.[A-Z]{2,4}$", "", value.strip())
    try:
        return datetime.fromisoformat(cleaned.replace("Z", "+00:00")).date()
    except ValueError:
        pass
    # Last resort: keep only a leading YYYY-MM-DD and ignore the time part.
    m = re.match(r"(\d{4}-\d{2}-\d{2})", cleaned)
    if not m:
        return None
    try:
        return date.fromisoformat(m.group(1))
    except ValueError:
        return None


def _iter_jsonld_objects(data: object):
    """Flatten a decoded JSON-LD payload into a stream of dicts.

    A dict whose ``@graph`` is a list is replaced by its (recursively
    flattened) graph members; any other dict is yielded as-is; lists are
    flattened recursively; every other value is ignored.
    """
    if isinstance(data, dict):
        graph = data.get("@graph")
        if isinstance(graph, list):
            for item in graph:
                yield from _iter_jsonld_objects(item)
        else:
            yield data
    elif isinstance(data, list):
        for item in data:
            yield from _iter_jsonld_objects(item)


def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD object typed ``VideoObject`` found in *html*.

    Each block matched by ``_JSONLD_RE`` is decoded independently; blocks that
    are empty or not valid JSON are skipped. Returns ``None`` when no block
    contains a ``VideoObject``.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            # strict=False accepts raw control characters (newlines, tabs)
            # inside string values instead of rejecting the whole block.
            data = json.loads(raw, strict=False)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for obj in _iter_jsonld_objects(data):
            types = obj.get("@type")
            # JSON-LD allows `@type` to be a single string or an array.
            if types == "VideoObject" or (
                isinstance(types, list) and "VideoObject" in types
            ):
                return obj
    return None


def _clean_link_text(raw: str) -> str:
    """Strip HTML tags, decode entities and collapse whitespace runs."""
    text = _HTML_TAG_RE.sub("", raw)
    text = html_mod.unescape(text)
    return " ".join(text.split()).strip()


def _jsonld_text(value: object) -> str | None:
    """Return *value* as stripped, non-empty text, or ``None``.

    A list is reduced to its first non-blank string member; any value that is
    not a string (after that reduction) yields ``None``.
    """
    if isinstance(value, list):
        value = next(
            (item for item in value if isinstance(item, str) and item.strip()),
            None,
        )
    if isinstance(value, str):
        return value.strip() or None
    return None


def _studio_from_detail(sitetag: str, detail_html: str) -> RawStudio | None:
    """Build the studio from the first `/channels/` link that has link text."""
    for m in _STUDIO_LINK_RE.finditer(detail_html):
        name = _clean_link_text(m.group(2))
        if not name:
            continue
        slug = m.group(1).strip()
        return RawStudio(
            external_id=f"{sitetag}:channel:{slug}",
            name=name,
            slug=slug,
        )
    return None


def _performers_from_detail(sitetag: str, detail_html: str) -> list[RawPerformer]:
    """Collect performers from all `/models/` links, de-duplicated by slug."""
    performers: list[RawPerformer] = []
    seen: set[str] = set()
    for m in _PERFORMER_LINK_RE.finditer(detail_html):
        slug = m.group(1).strip()
        if slug in seen:
            continue
        name = _clean_link_text(m.group(2))
        if not name:
            # Do not mark the slug as seen: a later link for the same slug
            # may carry the display name.
            continue
        seen.add(slug)
        performers.append(
            RawPerformer(
                external_id=f"{sitetag}:performer:{slug}",
                name=name,
            )
        )
    return performers


def _tags_from_detail(sitetag: str, detail_html: str) -> list[RawTag]:
    """Collect category tags from the repeated `video:tag` meta elements."""
    # Deny-list: skip every all-lowercase tag. yesporn.vip SEO-stuffs
    # `meta video:tag` with title tokens, performer first names and gibberish
    # ("bella", "rose", "reverse", "deep", "throat", "ddca"), always lowercase.
    # Legit categories are always Title Case ("Big Ass", "Deep Throat",
    # "Blonde", "Gangbang") or UPPER ("MILF", "BBW"). Confirmed in a 20-scene
    # dry run on 2026-05-27. Trade-off: hypothetical legit lowercase tags are
    # lost — acceptable because tags weigh only 0.05 in the resolver's
    # composite scoring.
    tags: list[RawTag] = []
    seen: set[str] = set()
    for m in _META_TAG_RE.finditer(detail_html):
        name = html_mod.unescape(m.group(1)).strip()
        if not name or name == name.lower():
            continue
        slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
        # An empty slug (name without any ASCII letter/digit) would make all
        # such tags collide on the same external id, so drop them.
        if not slug or slug in seen:
            continue
        seen.add(slug)
        tags.append(
            RawTag(external_id=f"{sitetag}:tag:{slug}", name=name, slug=slug)
        )
    return tags


class YesPornVipScraper(BaseBrowseScraper):
    """Browse scraper for the yesporn.vip `/latest-updates/` listing."""

    sitetag = "yespornvip"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*; page 1 (or lower) has no suffix."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs found in *listing_html*, unique, in page order."""
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Turn one scene page into a ``RawScene``.

        Args:
            scene_url: URL of the scene page; also the source of the numeric
                video id used in the external id.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or ``None`` when the page has no JSON-LD
            ``VideoObject`` or that object has no usable ``name``.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("yesporn: no JSON-LD VideoObject on %s", scene_url)
            return None

        title = _jsonld_text(video.get("name"))
        if not title:
            return None

        video_id_m = _VIDEO_ID_RE.search(scene_url)
        video_id = video_id_m.group(1) if video_id_m else None

        description = _jsonld_text(video.get("description"))

        # Duration: prefer the `video:duration` meta (plain seconds), fall
        # back to the JSON-LD ISO duration.
        duration_sec: int | None = None
        meta_dur = meta_content(detail_html, property="video:duration")
        # isdecimal() (unlike isdigit()) only admits characters int() accepts.
        if meta_dur and meta_dur.isdecimal():
            duration_sec = int(meta_dur) or None
        if duration_sec is None:
            iso_duration = video.get("duration")
            if isinstance(iso_duration, str):
                duration_sec = _parse_iso_duration(iso_duration)

        # Release date: prefer the `video:release_date` meta (cleaner format
        # with timezone); fall back to the JSON-LD uploadDate when the meta is
        # missing or cannot be parsed.
        release_date = _parse_iso_date(
            meta_content(detail_html, property="video:release_date")
        ) or _parse_iso_date(video.get("uploadDate"))

        # thumbnailUrl may be a single URL or a list of URLs; use the first.
        thumbnail_url = _jsonld_text(video.get("thumbnailUrl"))

        studio = _studio_from_detail(self.sitetag, detail_html)
        performers = _performers_from_detail(self.sitetag, detail_html)
        tags = _tags_from_detail(self.sitetag, detail_html)

        # Phash from thumbnailUrl — BunnyCDN `yesnn.b-cdn.net` hosts 1.jpg per
        # scene. Hit rate against canonical TPDB/StashDB is unknown until a
        # pilot run; graceful: no phash → the resolver falls back to composite
        # scoring (studio + performer + date + duration + title token-set).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Playback — page_url points at the scene page. The direct mp4
        # (`view_video_download.php`) needs the PHPSESSID cookie
        # (data-attach-session attribute), so it is not usable server-side.
        # Extractor `yespornvip` → `_vps_blocked_fallback.extract`: the mobile
        # WebView on the phone IP obtains the session natively and
        # INJECTED_JS scrapes the page.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            # Fall back to the full URL when no numeric id is present.
            external_id=f"{self.sitetag}:{video_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )