Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
215 lines
7.6 KiB
Python
215 lines
7.6 KiB
Python
"""porn00.org — latest-vids browse scraper.
|
|
|
|
URL patterns:
|
|
- Listing: `/latest-vids/` (page 1), `/latest-vids/2/`, ...
|
|
- Scene: `/video/<slug>/`
|
|
- Performer: `/<slug>/` (np. `/august-skye/`) — w sekcji "Pornstars:" na detail
|
|
- Categories: `/category-name/<slug>/`
|
|
|
|
Sygnały dostępne:
|
|
- Title (listing card + h1 + og:title)
|
|
- Performer(s) (z sekcji "Pornstars:" na detail page — pojedynczy slug per link)
|
|
- Categories (z sekcji "Categories:" — `/category-name/<slug>/`)
|
|
- Duration (listing card `<div class="duration">MM:SS</div>`)
|
|
- Direct mp4 (KVS engine — `video_url: 'https://www.porn00.org/get_file/.../<id>.mp4'`)
|
|
- Thumbnail (own CDN `/contents/videos_screenshots/.../1.jpg`)
|
|
|
|
BRAK:
|
|
- Studio
|
|
- Release year / data
|
|
- Description
|
|
|
|
Tytuł format: `"PerformerName - Scene Title"` (eg "August Skye - Helping Him...").
|
|
Performer name w prefixie tytułu zwykle pokrywa się z first `/pornstars/` link.
|
|
|
|
Expected pilot wynik: niski canonical match rate (~5-10%) bo brak studio/year. Direct
|
|
mp4 to bonus playback source dla scen które matchują canonical z innych źródeł.
|
|
"""
|
|
from __future__ import annotations

import html
import logging
import re
from urllib.parse import urljoin

from app.connectors.base import (
    RawFingerprint,
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
    meta_content,
)
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
# Site origin (no trailing slash); used to build listing URLs and the thumbnail Referer.
_BASE = "https://www.porn00.org"
|
|
|
|
# Listing card pattern (z chrome devtools snapshot 2026-05-17):
|
|
# <div class="item">
|
|
# <a href="https://www.porn00.org/video/<slug>/" title="...">
|
|
# <img class="thumb lazy-load" src="...contents/videos_screenshots/<bucket>/<id>/320x180/1.jpg" data-cnt="5">
|
|
# </a>
|
|
# <strong class="title">Title</strong>
|
|
# <div class="duration">34:34</div>
|
|
# </div>
|
|
_LISTING_CARD_RE = re.compile(
|
|
r'<div class="item\s*">'
|
|
r'.*?<a href="(?P<url>https://www\.porn00\.org/video/[^"]+/)"\s+title="(?P<title>[^"]+)"'
|
|
r'.*?<img class="thumb[^"]*"\s+src="(?P<thumb>[^"]+)"'
|
|
r'.*?<div class="duration">(?P<dur>[^<]+)</div>',
|
|
re.IGNORECASE | re.DOTALL,
|
|
)
|
|
|
|
# Performer link pattern (porn00 konwencja): `/star-name/<slug>/`
|
|
# (analogicznie do `/category-name/`, `/tags-name/`).
|
|
_PERFORMER_LINK_RE = re.compile(
|
|
r'<a\s+href="https://www\.porn00\.org/star-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
# Categories: <a href="https://www.porn00.org/category-name/<slug>/">Name</a>
|
|
_CATEGORY_LINK_RE = re.compile(
|
|
r'<a\s+href="https://www\.porn00\.org/category-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
# Direct mp4 stream z KVS flashvars: `video_url: 'https://.../43144.mp4/?v-acctoken=...'`.
|
|
# URL może mieć cokolwiek po `.mp4`: `/?v-acctoken=...`, `?q=720p`, itp. — bierzemy
|
|
# wszystko do najbliższego `'` lub `"`.
|
|
_VIDEO_URL_RE = re.compile(
|
|
r"""video_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
|
|
)
|
|
# Wariant 720p (KVS często serwuje 360p domyślnie + 720p w `video_alt_url`).
|
|
_VIDEO_ALT_URL_RE = re.compile(
|
|
r"""video_alt_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _parse_mmss(s: str) -> int | None:
|
|
"""`34:34` → 2074, `1:20:37` → 4837."""
|
|
parts = s.strip().split(":")
|
|
try:
|
|
if len(parts) == 2:
|
|
return int(parts[0]) * 60 + int(parts[1])
|
|
if len(parts) == 3:
|
|
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
|
|
except ValueError:
|
|
return None
|
|
return None
|
|
|
|
|
|
class Porn00Scraper(BaseBrowseScraper):
    """Browse scraper for the porn00.org `/latest-vids/` listing.

    Listing pages supply each scene's URL plus card metadata (title, thumbnail,
    duration); detail pages supply performers, categories and the direct mp4
    stream. The site exposes no studio signal, so `studio` is always None.
    """

    sitetag = "porn00org"

    def __init__(self) -> None:
        """Initialise the base scraper and an empty listing-card cache."""
        super().__init__()
        # Listing-card metadata (title, thumbnail, duration) keyed by scene URL.
        # The detail page carries no duration in its meta tags (no og:duration),
        # so the listing card is the source of truth for it.
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for `page`; pages <= 1 map to the unnumbered URL."""
        if page <= 1:
            return f"{_BASE}/latest-vids/"
        return f"{_BASE}/latest-vids/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found on a listing page, in page order.

        As a side effect, rebuilds `self._listing_cache` with each card's title,
        thumbnail URL and parsed duration so `_parse_detail` can look them up.
        """
        # NOTE(review): the cache is replaced on every call, which presumes the
        # detail pages of one listing page are parsed before the next listing
        # page is extracted — confirm against BaseBrowseScraper.
        self._listing_cache = {}
        urls: list[str] = []
        for card in _LISTING_CARD_RE.finditer(listing_html):
            url = card.group("url")
            if url in self._listing_cache:
                continue
            self._listing_cache[url] = {
                # Attribute values are HTML-escaped in the page source
                # (`&amp;`, `&#039;`, ...), so decode them before storing.
                "title": html.unescape(card.group("title")).strip(),
                "thumb": html.unescape(card.group("thumb")),
                "duration_sec": _parse_mmss(card.group("dur") or ""),
            }
            urls.append(url)
        return urls

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a scene's detail page.

        Args:
            scene_url: Scene page URL; also the key into the listing-card cache.
            detail_html: HTML of the detail page.

        Returns:
            The parsed scene, or None when no title is available from either
            og:title or the cached listing card.
        """
        card = self._listing_cache.get(scene_url, {})

        # og:title is preferred (cleaner); the listing-card title is the fallback.
        title = meta_content(detail_html, property="og:title") or card.get("title")
        if not title:
            return None

        duration_sec = card.get("duration_sec")

        # Prefer og:image (full-size preview) over the 320x180 listing thumbnail.
        thumb = meta_content(detail_html, property="og:image") or card.get("thumb")
        # Resolve relative or protocol-relative image URLs against the site
        # origin; URLs that are already absolute keep their own host and path.
        thumb = urljoin(f"{_BASE}/", thumb) if thumb else None

        # Performers: every `/star-name/<slug>/` link on the page is a performer.
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for pm in _PERFORMER_LINK_RE.finditer(detail_html):
            slug = pm.group(1).lower()
            # Anchor text is raw HTML: decode entities, then drop blank labels.
            name = html.unescape(pm.group(2)).strip()
            if slug in seen_perf or not name or not (2 <= len(slug) <= 60):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{slug}",
                    name=name,
                )
            )

        # Categories become tags.
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for cm in _CATEGORY_LINK_RE.finditer(detail_html):
            slug = cm.group(1).lower()
            name = html.unescape(cm.group(2)).strip()
            if slug in seen_tag or not name:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(
                    external_id=f"{self.sitetag}:tag:{slug}",
                    name=name,
                    slug=slug,
                )
            )

        # Direct mp4 from the KVS flashvars: prefer 720p (video_alt_url) over
        # 360p (video_url).
        stream_url: str | None = None
        if (vm := _VIDEO_ALT_URL_RE.search(detail_html)):
            stream_url = vm.group(1)
        elif (vm := _VIDEO_URL_RE.search(detail_html)):
            stream_url = vm.group(1)

        # porn00 renders its own screenshots (`/contents/videos_screenshots/`),
        # so a canonical phash match is unlikely; the hash is computed anyway.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            studio=None,  # porn00 exposes no studio signal
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|