diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index e1b85c9..09669f2 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -91,16 +91,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # mobile = black screen (player JS nie inicjalizuje się przez Turnstile). 16% # scen solo (no backup tube), 84% multi-source — user może użyć innego tube. yt-dlp # nie wspiera DoodStream ("Piracy"), własny resolver TBD jeśli warto. - # SiskaScraper — wyłączony 2026-05-16, pozostaje WYŁĄCZONY. Rewizja 2026-06-20 - # (user fa4083a2): świeże filmy (videoID 227xxx) embedują playmogo + luluvid i - # SĄ phone-resolwowalne (_embed_iframe oddaje type='hoster', extractor+regex - # zaktualizowane do `video.php?videoID=` w siska.py). ALE search siski jest - # ZEPSUTY site-side: `?s=` ignoruje zapytanie i zwraca zawsze latest - # (angela white == riley reid == homepage). Jako BaseSearchScraper (performer- - # driven) zawsze yielduje 0 → bezcelowy. Żeby ożywić, trzeba PRZEROBIĆ na - # browse-scraper (latest chronologicznie) — zmiana charakteru ingestu, do decyzji. - # Stare self-player filmy (player.siska.video → cfglobalcdn) z 2018 są martwe. - # SiskaScraper, + # SiskaScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-20, + # bo search siski zepsuty site-side — `?s=` ignoruje query). Patrz siska.py. # Porn4DaysScraper — wyłączony 2026-05-12 (post audit fix). 100% scen na streamtape # only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL = # streamtape, brak SERVER2/SERVER3 backup. Porn-app sam olewa porn4days. 10,346 @@ -165,6 +157,11 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ FreshpornoScraper, + # SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski + # zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page//`. + # Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/ + # kategoria). Playback: playmogo + luluvid → telefon resolwuje phone-side. + SiskaScraper, # PornXPScraper — pilot 2026-05-17 (20 scen): studio 100%, performer 95%, # release_date 100%, duration 100%, stream_url 100%, phash 100%. Najlepsze # sygnały spośród browse-mode scraperów. Stream direct mp4 (sv.porn-xp.com) diff --git a/app/connectors/direct_scrapers/siska.py b/app/connectors/direct_scrapers/siska.py index 8dbb32c..2433406 100644 --- a/app/connectors/direct_scrapers/siska.py +++ b/app/connectors/direct_scrapers/siska.py @@ -1,26 +1,178 @@ -"""siska.video — direct HTML scrape. +"""siska.video — latest-vids browse scraper. -Search: `https://siska.video/page//?s=` (działa nadal). -Scene URL: `https://siska.video/video.php?videoID=` (zmiana 2026-05+, dawniej `//`). +Historia: dawniej performer-driven search scraper (`?s=`), ale siska zepsuła +wyszukiwarkę (zwraca latest niezależnie od query). Przerobione na BROWSE (latest +chronologicznie z `/page//`), re-enabled 2026-06-20 (user fa4083a2). -Nowy format nie ma słów tytułu w URL (slug = numer videoID), więc do `slug` (którego -`_search_base` używa do token-filtra query + derywacji tytułu) bierzemy `title='...'` -z tego samego . Świeże filmy embedują playmogo + luluvid → telefon resolwuje -phone-side (_embed_iframe oddaje type='hoster'). Re-enabled 2026-06-20 (user fa4083a2). +Cały blok kafelka listingu ma komplet metadanych (zero detail-fetchy): + + 40 : 27 + <Performer> - <Tytuł> - <Studio> +→ tytuł, duration, miniatura, performer+studio (alt), kategoria (ścieżka thumba). + +Playback: świeże filmy embedują playmogo (DoodStream clone) + luluvid (filemoon +family). Extractor `siskavideo` → `_embed_iframe.extract` oddaje type='hoster'; +telefon resolwuje phone-side. page_url = video.php?videoID=. """ from __future__ import annotations +import logging import re -from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.connectors.base import ( + RawFingerprint, + RawPerformer, + RawPlaybackSource, + RawScene, + RawStudio, + RawTag, +) +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, +) +from app.extractors import browser_get + +log = logging.getLogger(__name__) + +_BASE = "https://siska.video" + +# Kafelek: . Reszta pól w oknie po tym matchu. +_A_RE = re.compile( + r"[^']*)'\s+href='(?Phttps://siska\.video/video\.php\?videoID=\d+)'", + re.IGNORECASE, +) +_DUR_RE = re.compile(r"th_video_duration'>\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<") +_THUMB_RE = re.compile(r"data-src='([^']+\.(?:jpg|jpeg|webp|png))'", re.IGNORECASE) +_ALT_RE = re.compile(r"alt='([^']*)'") +_CAT_RE = re.compile(r"/category/([^/]+)/", re.IGNORECASE) -class SiskaScraper(BaseSearchScraper): +def _parse_duration(text: str | None) -> int | None: + """`40 : 27` → 2427 (MM:SS); `1 : 05 : 30` → H:MM:SS. None gdy brak.""" + if not text: + return None + parts = [p.strip() for p in text.split(":")] + try: + nums = [int(p) for p in parts] + except ValueError: + return None + if len(nums) == 2: + return nums[0] * 60 + nums[1] + if len(nums) == 3: + return nums[0] * 3600 + nums[1] * 60 + nums[2] + return None + + +def _slugify(name: str) -> str: + return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") + + +class SiskaScraper(BaseBrowseScraper): sitetag = "siskavideo" - _search_url_template = "https://siska.video/page/{page}/?s={query}" - # - # `slug` = tytuł (token-filtr + tytuł działają na nim; numer videoID nie ma słów). - _scene_url_re = re.compile( - r"[^']*)'\s+href='(?Phttps://siska\.video/video\.php\?videoID=\d+)'", - re.IGNORECASE, - ) + + def _listing_url(self, page: int) -> str: + return f"{_BASE}/page/{page}/" + + # crawl_page nadpisany → poniższe abstrakcje nieużywane, ale wymagane do instancji. + def _extract_scene_urls(self, listing_html: str) -> list[str]: + return [m.group("url") for m in _A_RE.finditer(listing_html)] + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + return None + + def crawl_page(self, page: int) -> list[RawScene] | None: + url = self._listing_url(page) + try: + res = browser_get(url, timeout=self._timeout) + html = res.text if hasattr(res, "text") else res + except Exception as e: + log.warning("siska browse listing fetch failed (page %d): %s", page, e) + return None + + out: list[RawScene] = [] + seen: set[str] = set() + for m in _A_RE.finditer(html): + scene_url = m.group("url") + if scene_url in seen: + continue + seen.add(scene_url) + title = (m.group("title") or "").strip() + if not title: + continue + window = html[m.end():m.end() + 700] + + dm = _DUR_RE.search(window) + duration_sec = _parse_duration(dm.group(1) if dm else None) + tm = _THUMB_RE.search(window) + thumbnail_url = tm.group(1) if tm else None + + # alt='Performer - Tytuł - Studio' → performer (pierwszy), studio (ostatni). + performers: list[RawPerformer] = [] + studio: RawStudio | None = None + am = _ALT_RE.search(window) + if am and " - " in am.group(1): + parts = [p.strip() for p in am.group(1).split(" - ") if p.strip()] + if len(parts) >= 2: + pname = parts[0] + sname = parts[-1] + if pname: + performers.append( + RawPerformer( + external_id=f"{self.sitetag}:performer:{_slugify(pname)}", + name=pname, + ) + ) + if sname and len(parts) >= 3: + studio = RawStudio( + external_id=f"{self.sitetag}:studio:{_slugify(sname)}", + name=sname, + slug=_slugify(sname), + ) + + # Kategoria ze ścieżki miniatury (/category//.jpg). + tags: list[RawTag] = [] + if thumbnail_url: + cm = _CAT_RE.search(thumbnail_url) + if cm and cm.group(1).lower() not in ("uncategorized", ""): + cat = cm.group(1).replace("-", " ").replace("_", " ").strip() + tags.append( + RawTag( + external_id=f"{self.sitetag}:tag:{_slugify(cat)}", + name=cat, + slug=_slugify(cat), + ) + ) + + fingerprints: list[RawFingerprint] = [] + if thumbnail_url: + ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/") + if ph: + fingerprints.append(RawFingerprint(kind="phash", value=ph)) + + playback_sources = [ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + duration_sec=duration_sec, + thumbnail_url=thumbnail_url, + ) + ] + + out.append( + RawScene( + external_id=f"{self.sitetag}:{scene_url}", + title=title, + duration_sec=duration_sec, + url=scene_url, + studio=studio, + performers=performers, + tags=tags, + fingerprints=fingerprints, + playback_sources=playback_sources, + ) + ) + + log.info("siska browse page %d: %d scenes", page, len(out)) + return out