"""siska.video — latest-vids browse scraper. Historia: dawniej performer-driven search scraper (`?s=`), ale siska zepsuła wyszukiwarkę (zwraca latest niezależnie od query). Przerobione na BROWSE (latest chronologicznie z `/page//`), re-enabled 2026-06-20 (user fa4083a2). Cały blok kafelka listingu ma komplet metadanych (zero detail-fetchy): 40 : 27 <Performer> - <Tytuł> - <Studio> → tytuł, duration, miniatura, performer+studio (alt), kategoria (ścieżka thumba). Playback: świeże filmy embedują playmogo (DoodStream clone) + luluvid (filemoon family). Extractor `siskavideo` → `_embed_iframe.extract` oddaje type='hoster'; telefon resolwuje phone-side. page_url = video.php?videoID=. """ from __future__ import annotations import logging import re from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, ) from app.extractors import browser_get log = logging.getLogger(__name__) _BASE = "https://siska.video" # Kafelek: . Reszta pól w oknie po tym matchu. _A_RE = re.compile( r"[^']*)'\s+href='(?Phttps://siska\.video/video\.php\?videoID=\d+)'", re.IGNORECASE, ) _DUR_RE = re.compile(r"th_video_duration'>\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<") _THUMB_RE = re.compile(r"data-src='([^']+\.(?:jpg|jpeg|webp|png))'", re.IGNORECASE) _ALT_RE = re.compile(r"alt='([^']*)'") _CAT_RE = re.compile(r"/category/([^/]+)/", re.IGNORECASE) def _parse_duration(text: str | None) -> int | None: """`40 : 27` → 2427 (MM:SS); `1 : 05 : 30` → H:MM:SS. None gdy brak.""" if not text: return None parts = [p.strip() for p in text.split(":")] try: nums = [int(p) for p in parts] except ValueError: return None if len(nums) == 2: return nums[0] * 60 + nums[1] if len(nums) == 3: return nums[0] * 3600 + nums[1] * 60 + nums[2] return None def _slugify(name: str) -> str: return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") class SiskaScraper(BaseBrowseScraper): sitetag = "siskavideo" def _listing_url(self, page: int) -> str: return f"{_BASE}/page/{page}/" # crawl_page nadpisany → poniższe abstrakcje nieużywane, ale wymagane do instancji. def _extract_scene_urls(self, listing_html: str) -> list[str]: return [m.group("url") for m in _A_RE.finditer(listing_html)] def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: return None def crawl_page(self, page: int) -> list[RawScene] | None: url = self._listing_url(page) try: res = browser_get(url, timeout=self._timeout) html = res.text if hasattr(res, "text") else res except Exception as e: log.warning("siska browse listing fetch failed (page %d): %s", page, e) return None out: list[RawScene] = [] seen: set[str] = set() for m in _A_RE.finditer(html): scene_url = m.group("url") if scene_url in seen: continue seen.add(scene_url) title = (m.group("title") or "").strip() if not title: continue window = html[m.end():m.end() + 700] dm = _DUR_RE.search(window) duration_sec = _parse_duration(dm.group(1) if dm else None) tm = _THUMB_RE.search(window) thumbnail_url = tm.group(1) if tm else None # alt='Performer - Tytuł - Studio' → performer (pierwszy), studio (ostatni). performers: list[RawPerformer] = [] studio: RawStudio | None = None am = _ALT_RE.search(window) if am and " - " in am.group(1): parts = [p.strip() for p in am.group(1).split(" - ") if p.strip()] if len(parts) >= 2: pname = parts[0] sname = parts[-1] if pname: performers.append( RawPerformer( external_id=f"{self.sitetag}:performer:{_slugify(pname)}", name=pname, ) ) if sname and len(parts) >= 3: studio = RawStudio( external_id=f"{self.sitetag}:studio:{_slugify(sname)}", name=sname, slug=_slugify(sname), ) # Kategoria ze ścieżki miniatury (/category//.jpg). tags: list[RawTag] = [] if thumbnail_url: cm = _CAT_RE.search(thumbnail_url) if cm and cm.group(1).lower() not in ("uncategorized", ""): cat = cm.group(1).replace("-", " ").replace("_", " ").strip() tags.append( RawTag( external_id=f"{self.sitetag}:tag:{_slugify(cat)}", name=cat, slug=_slugify(cat), ) ) fingerprints: list[RawFingerprint] = [] if thumbnail_url: ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] out.append( RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, ) ) log.info("siska browse page %d: %d scenes", page, len(out)) return out