Mobile / OTA: - Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org - Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version) - backend.ts: default public backend auto-connect (no manual login) WebView fallback fix (PlayerScreen INJECTED_JS): - Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init) - Context-scoped: only clicks consent buttons inside cookie/gdpr containers - Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init) Resolver: - Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited) → composite_score hard-reject / cap; wired into scene_score + bulk_dedup - aggregator-mode candidate query: LIMIT 500 + title-match ordering Connectors: - porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot landing: APK links → goon-v0.1.9.apk Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
271 lines
9.5 KiB
Python
271 lines
9.5 KiB
Python
"""porndoe.com — latest-vids browse scraper.
|
|
|
|
Dołączony 2026-05-21 (theporndude audit). Jedyny verified high-value candidate
|
|
z 172 tube'ów na theporndude.com/top-porn-tube-sites + /full-porn-movies-sites.
|
|
|
|
Czemu wart: każda scena ma kompletny **JSON-LD VideoObject** schema:
|
|
- name (title), description, uploadDate (ISO timestamp), duration (ISO 8601)
|
|
- producer + publisher → named studio z `/channel-profile/<slug>` URL
|
|
- actor[] → named performers z `/pornstars-profile/<slug>` URL
|
|
- thumbnailUrl (CDN p.cdnc.porndoe.com)
|
|
|
|
To wystarczy do composite fuzzy match w resolverze (studio + performer Jaccard +
|
|
date proximity + title token-set + duration). Phash hit-rate niski (porndoe robi
|
|
własne crop-thumbnaile 390x219, nie hot-linkuje studio art) — ale rich metadata
|
|
nadrabia, jak pornxp/porn00.
|
|
|
|
URL patterns:
|
|
- Listing: `/videos/most-recent?page=N` (page 1 = newest, ~31 scen/page)
|
|
- Scene: `/watch/<id>` gdzie id = `pd` + 10 alfanum (stable)
|
|
- Studio: `/channel-profile/<slug>`
|
|
- Performer: `/pornstars-profile/<slug>`
|
|
- Tags/categories: `/category/<id>/<slug>`
|
|
|
|
Playback: stream URL NIE jest inline w SSR HTML — player JS init dopiero po user
|
|
"Play" click. Dajemy playback_source z page_url + origin `tube:porndoecom`;
|
|
extractor w `_REGISTRY` mapuje na `_vps_blocked_fallback.extract` → mobile WebView
|
|
INJECTED_JS scrapuje `<video>.src` po phone IP (0 VPS bandwidth, zgodne z
|
|
pre-public bandwidth/anonimowość priorytet).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import date, datetime
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
)
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_BASE = "https://porndoe.com"
|
|
|
|
# Scene listing — `<a href="/watch/pd7a3o0e8v2b">`. Id = `pd` + alfanum.
|
|
_SCENE_URL_RE = re.compile(r'href="(/watch/[a-z0-9]+)"', re.IGNORECASE)
|
|
_WATCH_ID_RE = re.compile(r"/watch/([a-z0-9]+)", re.IGNORECASE)
|
|
|
|
# JSON-LD <script> bloki.
|
|
_JSONLD_RE = re.compile(
|
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
|
|
re.IGNORECASE | re.DOTALL,
|
|
)
|
|
|
|
# Tagi/kategorie z DOM (JSON-LD genre bywa pusty). porndoe URL: `/category/<id>/<slug>`.
|
|
_TAG_LINK_RE = re.compile(
|
|
r'href="/category/\d+/([a-z0-9\-]+)"[^>]*>([^<]+)</a>', re.IGNORECASE
|
|
)
|
|
|
|
# ISO 8601 duration — porndoe emituje "PT8M0S" (czasem "T8M0S" bez P).
|
|
_ISO_DUR_RE = re.compile(
|
|
r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE
|
|
)
|
|
|
|
|
|
def _parse_iso_duration(value: str | None) -> int | None:
|
|
"""`PT11M7S` / `T8M0S` → sekundy. None gdy format nieznany."""
|
|
if not value:
|
|
return None
|
|
m = _ISO_DUR_RE.match(value.strip())
|
|
if not m:
|
|
return None
|
|
h = int(m.group(1) or 0)
|
|
mn = int(m.group(2) or 0)
|
|
s = int(m.group(3) or 0)
|
|
total = h * 3600 + mn * 60 + s
|
|
return total or None
|
|
|
|
|
|
def _parse_iso_date(value: str | None) -> date | None:
|
|
"""`2026-05-20T14:55:13+00:00` → date. None gdy parse fail."""
|
|
if not value:
|
|
return None
|
|
try:
|
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
|
except ValueError:
|
|
# Fallback: pierwsze 10 znaków YYYY-MM-DD
|
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
|
if m:
|
|
try:
|
|
return date.fromisoformat(m.group(1))
|
|
except ValueError:
|
|
return None
|
|
return None
|
|
|
|
|
|
def _slug_from_url(url: str | None) -> str | None:
|
|
"""`https://porndoe.com/channel-profile/fantasy-girl-pass` → `fantasy-girl-pass`."""
|
|
if not url:
|
|
return None
|
|
m = re.search(r"/(?:channel-profile|pornstars-profile)/([a-z0-9\-]+)", url, re.IGNORECASE)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def _iter_jsonld_objects(data: object):
|
|
"""Spłaszcza JSON-LD: dict / list / @graph → strumień dict-ów."""
|
|
if isinstance(data, dict):
|
|
graph = data.get("@graph")
|
|
if isinstance(graph, list):
|
|
for item in graph:
|
|
yield from _iter_jsonld_objects(item)
|
|
else:
|
|
yield data
|
|
elif isinstance(data, list):
|
|
for item in data:
|
|
yield from _iter_jsonld_objects(item)
|
|
|
|
|
|
def _extract_video_object(html: str) -> dict | None:
    """Find the first JSON-LD VideoObject in an HTML page.

    Every `application/ld+json` script block is decoded and flattened; blocks
    that are empty or not valid JSON are skipped.

    Args:
        html: Full detail-page HTML.

    Returns:
        The first node typed `VideoObject`, or None when the page has none.
    """
    for script in _JSONLD_RE.finditer(html):
        payload = script.group(1).strip()
        if not payload:
            continue
        try:
            data = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for obj in _iter_jsonld_objects(data):
            node_type = obj.get("@type")
            # JSON-LD allows `@type` to be one string or an array of strings.
            if node_type == "VideoObject" or (
                isinstance(node_type, list) and "VideoObject" in node_type
            ):
                return obj
    return None
|
|
|
|
|
|
class PornDoeScraper(BaseBrowseScraper):
    """Browse scraper for the porndoe.com "most recent" listing.

    Scene metadata is read from the JSON-LD VideoObject on each detail page;
    tags are read from category links in the page HTML.
    """

    sitetag = "porndoecom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for `page`; page <= 1 is the bare listing."""
        if page <= 1:
            return f"{_BASE}/videos/most-recent"
        return f"{_BASE}/videos/most-recent?page={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from a listing page, unique, in page order."""
        # dict.fromkeys drops repeats while keeping first-seen order.
        return list(
            dict.fromkeys(
                f"{_BASE}{m.group(1)}" for m in _SCENE_URL_RE.finditer(listing_html)
            )
        )

    @staticmethod
    def _first_text(value: object) -> str | None:
        """Return a stripped non-empty string from a JSON-LD property value.

        JSON-LD properties may hold a single value or a list of values; this
        returns the first usable string, or None when there is none.
        """
        candidates = value if isinstance(value, list) else [value]
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        return None

    @staticmethod
    def _entity_slug(name: str, url: object) -> str:
        """Return a slug for a studio/performer.

        Prefers the slug in the profile URL, otherwise derives one from
        `name`. May return "" when `name` has no letters or digits at all.
        """
        from_url = _slug_from_url(url) if isinstance(url, str) else None
        if from_url:
            return from_url
        ascii_slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
        if ascii_slug:
            return ascii_slug
        # Names without any ASCII letters/digits would all collapse to "" and
        # share one external id; fall back to a Unicode-aware slug for them.
        return re.sub(r"[\W_]+", "-", name.casefold()).strip("-")

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a scene detail page.

        Args:
            scene_url: Absolute URL of the detail page.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when the page has no JSON-LD VideoObject
            or the VideoObject has no usable title.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("porndoe: no JSON-LD VideoObject on %s", scene_url)
            return None

        # JSON-LD is third-party data: every scalar is normalised through
        # _first_text so lists / non-strings cannot crash the parse.
        title = self._first_text(video.get("name"))
        if not title:
            return None

        watch_id_m = _WATCH_ID_RE.search(scene_url)
        watch_id = watch_id_m.group(1) if watch_id_m else None

        description = self._first_text(video.get("description"))
        duration_sec = _parse_iso_duration(self._first_text(video.get("duration")))
        release_date = _parse_iso_date(
            self._first_text(video.get("uploadDate"))
            or self._first_text(video.get("datePublished"))
        )
        # thumbnailUrl may be a list of URLs; use the first one.
        thumbnail_url = self._first_text(video.get("thumbnailUrl"))

        # Studio: producer / publisher (Organization). Prefer producer.
        studio: RawStudio | None = None
        for key in ("producer", "publisher"):
            org = video.get(key)
            if not isinstance(org, dict):
                continue
            org_name = self._first_text(org.get("name"))
            if not org_name:
                continue
            org_slug = self._entity_slug(org_name, org.get("url"))
            if not org_slug:
                # No usable slug → the external id would be degenerate.
                continue
            studio = RawStudio(
                external_id=f"{self.sitetag}:channel:{org_slug}",
                name=org_name,
                slug=org_slug,
            )
            break

        # Performers: actor[] (a list of Person, or a single Person).
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        actors = video.get("actor")
        if isinstance(actors, dict):
            actors = [actors]
        if not isinstance(actors, list):
            actors = []
        for actor in actors:
            if not isinstance(actor, dict):
                continue
            actor_name = self._first_text(actor.get("name"))
            if not actor_name:
                continue
            actor_slug = self._entity_slug(actor_name, actor.get("url"))
            if not actor_slug or actor_slug in seen_perf:
                continue
            seen_perf.add(actor_slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{actor_slug}",
                    name=actor_name,
                )
            )

        # Tags: from DOM links shaped `/category/<id>/<slug>`.
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Skip empty link text and the generic "Categories"/"Tags" links.
            if not name or name.lower() in ("categories", "tags"):
                continue
            if slug in seen_tag or len(slug) > 60:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            )

        # Phash from the thumbnail. porndoe serves its own cropped thumbnails,
        # so a low hit rate is expected; with no match the resolver falls back
        # to composite scoring.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Playback: page_url points at the scene page. The stream is
        # JS-rendered, so the `porndoecom` extractor maps to
        # `_vps_blocked_fallback.extract` (mobile WebView scrape).
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{watch_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|