"""BasePlayTubeScraper — wspólna baza dla tube'ów na PlayTube CMS (hqfap, 4k69). Platforma rozpoznawalna po: `/watch/_.html`, sitemap index `/sitemaps/videos/sitemap-N.xml` (z ``), JSON-LD VideoObject na detail page'u (name + uploadDate + duration ISO 8601 + thumbnailUrl + contentUrl) oraz pillach ``. Listing NIE paginuje się GET-em (PlayTube doładowuje AJAX-em `aj/load-more/`), więc crawl_page buduje katalog z sitemapów (sort lastmod desc = newest first) i tnie na strony po `_PAGE_SIZE`. Działa dla browse_latest (pages 1-5) i deep_crawl (kursor do końca katalogu). Minus: sitemap laguje ~dobę za najświeższymi uploadami — akceptowalne przy dziennym harmonogramie. Subclass ustawia `base_url` + (opcjonalnie) override'uje `_pick_studio()` — PlayTube nie ma strukturalnego pola studio na scenie, studio siedzi w kategoriach (hqfap: suffix " Clips"; 4k69: nazwa z listy /studios). Cloudflare: HTML wymaga browser TLS (curl_cffi w browser_get); plain curl z VPS dostaje 403. Sitemapy i thumbnaile schodzą bez challenge'a. """ from __future__ import annotations import json import logging import re from datetime import date, datetime from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, ) from app.extractors import browser_get from app.normalize.text import slugify log = logging.getLogger(__name__) _PAGE_SIZE = 20 _SITEMAP_LOC_RE = re.compile(r"\s*([^<]+?)\s*") _URL_BLOCK_RE = re.compile(r"(.*?)", re.DOTALL | re.IGNORECASE) _LASTMOD_RE = re.compile(r"\s*([^<]+?)\s*") _SCENE_ID_RE = re.compile(r"_(\d+)\.html") _JSONLD_RE = re.compile( r']+type=["\']application/ld\+json["\'][^>]*>(.*?)', re.IGNORECASE | re.DOTALL, ) _ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE) # Pille performera/kategorii. PlayTube renderuje single-quoted attrs; dopuszczamy # oba quote'y. Nazwa z `` (href bywa URL-encoded / ze spacjami). _PILL_RE = re.compile( r"pornstar|category)/[^'\"]*['\"]" r".*?(?P[^<]+)", re.IGNORECASE | re.DOTALL, ) def _parse_iso_duration(value: str | None) -> int | None: """`PT26M48S` → sekundy. None gdy format nieznany.""" if not value: return None m = _ISO_DUR_RE.match(value.strip()) if not m: return None total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0) return total or None def _parse_iso_date(value: str | None) -> date | None: """`2026-06-09T16:00:00+00:00` → date. None gdy parse fail.""" if not value: return None try: return datetime.fromisoformat(value.replace("Z", "+00:00")).date() except ValueError: m = re.match(r"(\d{4}-\d{2}-\d{2})", value) if m: try: return date.fromisoformat(m.group(1)) except ValueError: return None return None def _extract_video_object(html: str) -> dict | None: """Pierwszy JSON-LD VideoObject w HTML (PlayTube emituje jeden, płaski dict).""" for m in _JSONLD_RE.finditer(html): raw = m.group(1).strip() if not raw: continue try: data = json.loads(raw) except (json.JSONDecodeError, ValueError): continue items = data if isinstance(data, list) else [data] for obj in items: if isinstance(obj, dict) and obj.get("@type") == "VideoObject": return obj return None class BasePlayTubeScraper(BaseBrowseScraper): base_url: str # np. "https://hqfap.com" — subclass ustawia def __init__(self) -> None: super().__init__() # Katalog URL-i scen z sitemap, newest-first. Lazy-init raz per instancję # (browse_latest i deep_crawl tworzą instancję per run, więc kilkanaście # fetchy XML amortyzuje się na cały run). self._catalog: list[str] | None = None # Hook: wybierz studio spośród nazw kategorii (display name) albo None. # Wybrana kategoria NIE trafia do tagów. def _pick_studio(self, category_names: list[str]) -> str | None: return None # crawl_page override (jak EpornerApiScraper) — listing nie jest stronicowalny # przez GET, źródłem paginacji jest sitemap. _listing_url/_extract_scene_urls # nieużywane, ale abstrakcyjne — dostarczamy no-op implementacje. def _listing_url(self, page: int) -> str: # pragma: no cover - nieużywane return f"{self.base_url}/sitemap.xml" def _extract_scene_urls(self, listing_html: str) -> list[str]: # pragma: no cover return [] def _load_catalog(self) -> list[str] | None: """Pełna lista URL-i scen posortowana lastmod desc. None = fetch fail.""" if self._catalog is not None: return self._catalog index_url = f"{self.base_url}/sitemap.xml" try: idx = browser_get(index_url, timeout=self._timeout) idx.raise_for_status() except Exception as e: log.warning("%s: sitemap index fetch failed: %s", self.sitetag, e) return None sitemap_urls = [ u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u ] if not sitemap_urls: log.warning("%s: sitemap index has no video sitemaps", self.sitetag) return None entries: list[tuple[str, str]] = [] # (lastmod, scene_url) for sm_url in sitemap_urls: try: sm = browser_get(sm_url, timeout=self._timeout) sm.raise_for_status() except Exception as e: # Brak jednego sitemapa ≠ fail całości — reszta katalogu wystarczy. log.warning("%s: sitemap fetch failed %s: %s", self.sitetag, sm_url, e) continue for block in _URL_BLOCK_RE.findall(sm.text): loc_m = _SITEMAP_LOC_RE.search(block) if not loc_m or "/watch/" not in loc_m.group(1): continue lastmod_m = _LASTMOD_RE.search(block) entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1))) if not entries: return None # Dedup po scene id (sitemap potrafi powtórzyć URL między plikami). entries.sort(key=lambda e: e[0], reverse=True) seen_ids: set[str] = set() catalog: list[str] = [] for _, url in entries: id_m = _SCENE_ID_RE.search(url) key = id_m.group(1) if id_m else url if key in seen_ids: continue seen_ids.add(key) catalog.append(url) log.info("%s: catalog loaded — %d scenes from %d sitemaps", self.sitetag, len(catalog), len(sitemap_urls)) self._catalog = catalog return catalog def crawl_page(self, page: int) -> list[RawScene] | None: catalog = self._load_catalog() if catalog is None: return None start = (page - 1) * _PAGE_SIZE chunk = catalog[start:start + _PAGE_SIZE] if not chunk: return [] out: list[RawScene] = [] for scene_url in chunk: try: res = browser_get(scene_url, timeout=self._timeout) res.raise_for_status() except Exception as e: log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e) continue try: raw = self._parse_detail(scene_url, res.text) except Exception as e: log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e) continue if raw is not None: out.append(raw) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: video = _extract_video_object(detail_html) if not video: log.info("%s: no JSON-LD VideoObject on %s", self.sitetag, scene_url) return None title = (video.get("name") or "").strip() if not title: return None id_m = _SCENE_ID_RE.search(scene_url) scene_id = id_m.group(1) if id_m else None duration_sec = _parse_iso_duration(video.get("duration")) release_date = _parse_iso_date(video.get("uploadDate")) thumbnail_url = video.get("thumbnailUrl") or None # Pille: pornstar → performer; category → studio (hook `_pick_studio`) # albo tag. Ocenzurowane nazwy (`Te***`) pomijamy — gwiazdki to nie dane. performers: list[RawPerformer] = [] category_names: list[str] = [] seen_perf: set[str] = set() for m in _PILL_RE.finditer(detail_html): name = m.group("name").strip() if not name or "*" in name: continue if m.group("kind").lower() == "pornstar": slug = slugify(name) if slug and slug not in seen_perf: seen_perf.add(slug) performers.append( RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name) ) elif name not in category_names: category_names.append(name) studio: RawStudio | None = None studio_name = self._pick_studio(category_names) if studio_name: studio = RawStudio( external_id=f"{self.sitetag}:studio:{slugify(studio_name)}", name=studio_name, slug=slugify(studio_name), ) tags: list[RawTag] = [] seen_tag: set[str] = set() picked = (studio_name or "").strip().lower() for name in category_names: # Studio-kategoria nie idzie do tagów (ani w wersji z suffixem " Clips"). if picked and name.strip().lower() in (picked, picked + " clips"): continue slug = slugify(name) if not slug or slug in seen_tag: continue seen_tag.add(slug) tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)) # Phash: thumbnaile bywają re-encodowanym studio art (szansa na match), # dla amatorskiego contentu nie zmatchują. Graceful: miss → composite scoring. fingerprints: list[RawFingerprint] = [] if thumbnail_url: ph = compute_thumbnail_phash(thumbnail_url, referer=self.base_url + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) # Stream: JSON-LD contentUrl wygasa (token time-bound) — NIE zapisujemy # stream_url; extractor per-sitetag resolvuje świeży on-demand. playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] return RawScene( external_id=f"{self.sitetag}:{scene_id or scene_url}", title=title, release_date=release_date, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )