Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
119 lines
3.9 KiB
Python
119 lines
3.9 KiB
Python
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
|
|
|
|
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
|
|
`https://0dayxx.com/0day-porn-video/<slug>/` (or sometimes `/<category>/<slug>/`).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import urllib.parse
|
|
from collections.abc import Iterator
|
|
|
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
|
from app.extractors import browser_get
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# Matches a double-quoted href that points at a 0dayxx.com scene page under one
# of the listed site sections. Group 1 is the scene URL without its trailing
# slash; group 2 is the slug (the final path segment).
_SCENE_URL_RE = re.compile(
    r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
)

# Open Graph <meta> tags. Group 1 is the raw value of the `content` attribute.
# Matching is case-insensitive.
_OG_TITLE_RE = re.compile(
    r'<meta\s+property="og:title"\s+content="([^"]+)"', flags=re.I
)
_OG_IMAGE_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"', flags=re.I
)
|
|
|
|
|
|
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
    """Fetch a 0dayxx detail page and extract ``(real_title, thumbnail_url)``.

    0dayxx is a wrapper (it embeds watchporn.to and others), so duration and
    tags are not available here — they live on watchporn.to. The og:image,
    however, is present on 0dayxx and gives a correctly sized thumbnail
    (200x200 — small, but better than none).

    Without this fetch, 0dayxx scenes reached dedup with the slug as the title
    and no thumbnail_url — i.e. with the two weakest signals at once — which
    caused either missed matches or false-positive merges (reported
    2026-05-09).

    Both values come from HTML attribute text, so character references such as
    ``&amp;`` or ``&#8211;`` are decoded before being returned.

    Args:
        scene_url: URL of the scene detail page to fetch.

    Returns:
        A ``(title, thumbnail_url)`` tuple. Each element is ``None`` when the
        request raises, the response status is not 200, the corresponding
        ``og:`` meta tag is not found, or its value is empty after cleanup.
    """
    # Function-scope import so this block does not depend on a change to the
    # file's top-level import section.
    import html

    try:
        response = browser_get(scene_url, timeout=20)
    except Exception as e:
        # Best-effort enrichment: any fetch failure falls back to "no detail".
        # NOTE(review): the exception types raised by browser_get are not
        # visible from this module, hence the broad catch.
        log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
        return None, None
    if response.status_code != 200:
        return None, None

    page_html = response.text
    title = None
    thumb = None
    if (m := _OG_TITLE_RE.search(page_html)):
        # Strip the ` | 0dayxx.com Daily...` suffix (og:title sometimes carries
        # it), then decode entities so the title compares cleanly in dedup.
        head = m.group(1).split("|")[0]
        title = html.unescape(head).strip() or None
    if (m := _OG_IMAGE_RE.search(page_html)):
        # Attribute values escape `&` as `&amp;`; decode to get a usable URL.
        thumb = html.unescape(m.group(1)).strip() or None
    return title, thumb
|
|
|
|
|
|
class ZeroDayXXScraper(BaseDirectTubeScraper):
    """Direct HTML scraper for 0dayxx.com search result pages."""

    sitetag = "0dayxxcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one page of 0dayxx.com search results.

        The search page is fetched once and scanned for scene links. A link is
        kept when at least one query token (whitespace-separated, lowercased,
        3+ characters) is a substring of the lowercased slug; when the query
        has no such token, no filtering is applied. Each kept link triggers one
        extra detail-page request via ``_fetch_detail`` to obtain the real
        title and a thumbnail; the slug with dashes replaced by spaces is the
        fallback title.

        Args:
            query: Search phrase. It is also recorded as the only performer of
                every yielded scene.
                NOTE(review): presumably callers search by performer name —
                confirm against the caller.
            page: Result page number placed in the search URL.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
                A value of zero or less yields nothing and makes no request.

        Yields:
            One ``RawScene`` per unique matching scene URL, in page order.
            Nothing is yielded when the search request raises or returns a
            non-200 status.
        """
        # The limit used to be checked only after a yield, so limit=0 still
        # cost a search request plus a detail fetch and produced one scene.
        if limit is not None and limit <= 0:
            return

        q = urllib.parse.quote_plus(query.strip())
        url = f"https://0dayxx.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            # NOTE(review): browser_get's exception types are not visible from
            # this module, hence the broad catch.
            log.warning("0dayxx search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.debug("0dayxx search returned HTTP %s for %s", r.status_code, url)
            return

        # Tokens shorter than 3 characters are ignored for the relevance check.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            # Normalise to a trailing slash so both href spellings of the same
            # scene collapse to one entry.
            scene_url = m.group(1) + "/"
            slug = m.group(2)
            if scene_url in seen:
                continue
            seen.add(scene_url)

            # Skip links whose slug shares no token with the query.
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            real_title, thumb = _fetch_detail(scene_url)
            title = real_title or slug.replace("-", " ").strip()

            yield RawScene(
                external_id=f"0dayxxcom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:0dayxxcom",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
                performers=[RawPerformer(name=query.strip())],
                raw={
                    "source": "direct_scraper:0dayxx",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            # Stop before the next detail fetch once the cap is reached.
            if limit is not None and yielded >= limit:
                return
|