"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search. Search: `https://0dayxx.com/page//?s=`. Scene URL format: `https://0dayxx.com/0day-porn-video//` (lub czasem `///`). """ from __future__ import annotations import logging import re import urllib.parse from collections.abc import Iterator from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.extractors import browser_get log = logging.getLogger(__name__) _SCENE_URL_RE = re.compile( r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"' ) _OG_TITLE_RE = re.compile( r' tuple[str | None, str | None]: """Pobiera 0dayxx detail page i wyciąga (real_title, thumbnail_url). 0dayxx jest wrapperem (embeduje watchporn.to/inne), więc duration/tagi tu nie są — siedzą na watchporn.to. og:image jednak jest na 0dayxx i daje miniaturkę z poprawnym wymiarem (200x200 — mała, ale lepsza niż żadna). Bez tego fetch'u sceny 0dayxx trafiały do dedupu z slug'iem jako title + bez thumbnail_url — czyli z dwoma najsłabszymi sygnałami na raz, co powodowało albo brak match'y albo false-positive merge'y (zgłoszone 2026-05-09). """ try: r = browser_get(scene_url, timeout=20) except Exception as e: log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e) return None, None if r.status_code != 200: return None, None title = None thumb = None if (m := _OG_TITLE_RE.search(r.text)): # Strip ` | 0dayxx.com Daily...` suffix (powtórki og:title czasem mają go). title = m.group(1).split("|")[0].strip() if (m := _OG_IMAGE_RE.search(r.text)): thumb = m.group(1).strip() return title, thumb class ZeroDayXXScraper(BaseDirectTubeScraper): sitetag = "0dayxxcom" def search( self, query: str, *, page: int = 1, limit: int | None = None, ) -> Iterator[RawScene]: q = urllib.parse.quote_plus(query.strip()) url = f"https://0dayxx.com/page/{page}/?s={q}" try: r = browser_get(url, timeout=30) except Exception as e: log.warning("0dayxx search fetch failed: %s", e) return if r.status_code != 200: return query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3} seen: set[str] = set() yielded = 0 for m in _SCENE_URL_RE.finditer(r.text): scene_url = m.group(1) + "/" slug = m.group(2) if scene_url in seen: continue seen.add(scene_url) slug_lower = slug.lower() if query_tokens and not any(tok in slug_lower for tok in query_tokens): continue real_title, thumb = _fetch_detail(scene_url) title = real_title or slug.replace("-", " ").strip() yield RawScene( external_id=f"0dayxxcom:{scene_url}", title=title, url=scene_url, playback_sources=[ RawPlaybackSource( origin="tube:0dayxxcom", page_url=scene_url, thumbnail_url=thumb, ) ], performers=[RawPerformer(name=query.strip())], raw={ "source": "direct_scraper:0dayxx", "query": query, "page": page, "url": scene_url, }, ) yielded += 1 if limit is not None and yielded >= limit: return