goon/app/connectors/direct_scrapers/siska.py
jtrzupek ac84da92a4 feat(siska): convert to browse scraper, re-enable (search broken site-side)
siska's ?s= search ignores the query (returns latest regardless), so the performer-driven search scraper always yielded 0 and was disabled. Rewrote SiskaScraper as a latest-browse scraper (BaseBrowseScraper, /page/<n>/) and moved it to ALL_BROWSE_SCRAPERS. The listing tile carries everything (no detail fetch): title, duration (MM:SS span), thumbnail (img data-src), performer + studio (img alt 'Performer - Title - Studio'), category (thumbnail path). Playback unchanged: fresh videos embed playmogo + luluvid, resolved phone-side via _embed_iframe. Verified ingest: 26 seen / 11 new / 15 updated / 0 errors — and 15 updated means siska scenes match existing canonical scenes, adding playback coverage rather than orphans. Now covered by the browse ingest-watchdog (48h) and the 6h browse-latest + deep-crawl jobs. Old self-player videos (player.siska.video -> cfglobalcdn, ~2018) are dead and age out.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-20 16:25:11 +02:00

178 lines
6.5 KiB
Python

"""siska.video — latest-videos browse scraper.

History: this used to be a performer-driven search scraper (`?s=<q>`), but siska
broke its search (it returns the latest videos regardless of the query). It was
converted to a BROWSE scraper (latest videos, chronologically, from `/page/<n>/`)
and re-enabled on 2026-06-20 (user fa4083a2).

Each listing tile carries the complete metadata set (zero detail fetches):
    <a title='<Title>' href='https://siska.video/video.php?videoID=<n>'>
    <span class='th_video_duration'>40 : 27</span>
    <img data-src='https://siska.video/category/<Category>/<id>.jpg'
         alt='<Performer> - <Title> - <Studio>'>
→ title, duration, thumbnail, performer + studio (alt), category (thumbnail path).

Playback: fresh videos embed playmogo (DoodStream clone) + luluvid (filemoon
family). The `siskavideo` extractor → `_embed_iframe.extract` returns
type='hoster'; the phone resolves it phone-side. page_url = video.php?videoID=<n>.
"""
from __future__ import annotations
import logging
import re
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.extractors import browser_get
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Site root; the listing URLs and the thumbnail Referer header are built from it.
_BASE = "https://siska.video"
# Tile anchor: <a title='..' href='..videoID=N'>. The remaining fields are
# searched for in a window of text right after this match.
_A_RE = re.compile(
r"<a\s+title='(?P<title>[^']*)'\s+href='(?P<url>https://siska\.video/video\.php\?videoID=\d+)'",
re.IGNORECASE,
)
# Duration label inside the th_video_duration span: 1-2 digits followed by one
# or two ":dd" groups, with optional whitespace around the colons.
_DUR_RE = re.compile(r"th_video_duration'>\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<")
# Single-quoted data-src value ending in an image extension (case-insensitive).
_THUMB_RE = re.compile(r"data-src='([^']+\.(?:jpg|jpeg|webp|png))'", re.IGNORECASE)
# Single-quoted alt attribute value ('Performer - Title - Studio' per the tile layout).
_ALT_RE = re.compile(r"alt='([^']*)'")
# Path segment that follows /category/ in the thumbnail URL.
_CAT_RE = re.compile(r"/category/([^/]+)/", re.IGNORECASE)
def _parse_duration(text: str | None) -> int | None:
"""`40 : 27` → 2427 (MM:SS); `1 : 05 : 30` → H:MM:SS. None gdy brak."""
if not text:
return None
parts = [p.strip() for p in text.split(":")]
try:
nums = [int(p) for p in parts]
except ValueError:
return None
if len(nums) == 2:
return nums[0] * 60 + nums[1]
if len(nums) == 3:
return nums[0] * 3600 + nums[1] * 60 + nums[2]
return None
def _slugify(name: str) -> str:
return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
class SiskaScraper(BaseBrowseScraper):
    """Browse scraper for the chronological listing pages of siska.video.

    ``crawl_page`` is overridden and builds every ``RawScene`` directly from
    the listing tiles, so no detail page is fetched. ``sitetag`` namespaces
    the external ids and the playback origin.
    """

    sitetag = "siskavideo"

    # Upper bound, in characters, on how far past a tile's opening <a> tag the
    # duration / thumbnail / alt attributes are searched for.
    _TILE_WINDOW_CHARS = 700

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page ``page`` (``/page/<n>/``)."""
        return f"{_BASE}/page/{page}/"

    # crawl_page is overridden, so the two abstract hooks below are unused,
    # but they must be defined for the class to be instantiable.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every tile URL found in ``listing_html``, in document order."""
        return [m.group("url") for m in _A_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Unused hook: all metadata comes from the listing, so always ``None``."""
        return None

    @staticmethod
    def _tile_window(html: str, matches: list[re.Match[str]], idx: int) -> str:
        """Return the text after tile ``idx`` in which its fields are searched.

        The window is at most ``_TILE_WINDOW_CHARS`` long and is cut short at
        the next anchor that points to a *different* video, so a tile missing
        a field never picks up the neighbouring tile's value. Repeated anchors
        for the same video do not end the window.
        """
        current = matches[idx]
        start = current.end()
        end = start + SiskaScraper._TILE_WINDOW_CHARS
        url = current.group("url")
        for j in range(idx + 1, len(matches)):
            following = matches[j]
            if following.start() >= end:
                break
            if following.group("url") != url:
                end = following.start()
                break
        return html[start:end]

    def _parse_alt(self, window: str) -> tuple[list[RawPerformer], RawStudio | None]:
        """Split ``alt='Performer - Title - Studio'`` into performer and studio.

        The first part is the performer and the last part is the studio; the
        studio is only taken when there are at least three parts. Returns
        ``([], None)`` when the alt text is missing or has fewer than two
        non-empty parts.
        """
        performers: list[RawPerformer] = []
        studio: RawStudio | None = None
        am = _ALT_RE.search(window)
        if not am or " - " not in am.group(1):
            return performers, studio
        parts = [p.strip() for p in am.group(1).split(" - ") if p.strip()]
        if len(parts) < 2:
            return performers, studio
        pname = parts[0]
        performers.append(
            RawPerformer(
                external_id=f"{self.sitetag}:performer:{_slugify(pname)}",
                name=pname,
            )
        )
        if len(parts) >= 3:
            sname = parts[-1]
            studio = RawStudio(
                external_id=f"{self.sitetag}:studio:{_slugify(sname)}",
                name=sname,
                slug=_slugify(sname),
            )
        return performers, studio

    def _category_tags(self, thumbnail_url: str | None) -> list[RawTag]:
        """Derive the category tag from the thumbnail path (``/category/<Cat>/``).

        Returns an empty list when there is no thumbnail, no category segment,
        the category is ``uncategorized``, or the name is empty once dashes and
        underscores are turned into spaces.
        """
        if not thumbnail_url:
            return []
        cm = _CAT_RE.search(thumbnail_url)
        if not cm or cm.group(1).lower() == "uncategorized":
            return []
        cat = cm.group(1).replace("-", " ").replace("_", " ").strip()
        if not cat:
            # A segment made only of separators would yield a nameless tag.
            return []
        return [
            RawTag(
                external_id=f"{self.sitetag}:tag:{_slugify(cat)}",
                name=cat,
                slug=_slugify(cat),
            )
        ]

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch listing page ``page`` and build one ``RawScene`` per tile.

        Returns ``None`` when the listing cannot be fetched or the response
        carries no text; otherwise the list of scenes (possibly empty). Tiles
        are de-duplicated by URL, and tiles with an empty title are skipped.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            html = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("siska browse listing fetch failed (page %d): %s", page, e)
            return None
        if not isinstance(html, str):
            # The str patterns below would raise on None/bytes; report it the
            # same way as a failed fetch.
            log.warning("siska browse listing returned no HTML (page %d)", page)
            return None

        matches = list(_A_RE.finditer(html))
        out: list[RawScene] = []
        seen: set[str] = set()
        for idx, m in enumerate(matches):
            scene_url = m.group("url")
            if scene_url in seen:
                continue
            seen.add(scene_url)
            title = (m.group("title") or "").strip()
            if not title:
                continue

            window = self._tile_window(html, matches, idx)
            dm = _DUR_RE.search(window)
            duration_sec = _parse_duration(dm.group(1) if dm else None)
            tm = _THUMB_RE.search(window)
            thumbnail_url = tm.group(1) if tm else None

            performers, studio = self._parse_alt(window)
            tags = self._category_tags(thumbnail_url)

            fingerprints: list[RawFingerprint] = []
            if thumbnail_url:
                ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
                if ph:
                    fingerprints.append(RawFingerprint(kind="phash", value=ph))

            playback_sources = [
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumbnail_url,
                )
            ]
            out.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    duration_sec=duration_sec,
                    url=scene_url,
                    studio=studio,
                    performers=performers,
                    tags=tags,
                    fingerprints=fingerprints,
                    playback_sources=playback_sources,
                )
            )
        log.info("siska browse page %d: %d scenes", page, len(out))
        return out