4k69.com (~65k scenes): same PlayTube CMS as hqfap - common logic moved to _playtube.py (sitemap catalog, JSON-LD, pills). Studio classified by matching category pills against the studios index page. Streams are get_file (fullmovies family) returned unresolved with mobile_direct, 2160p skipped. neporn.com: KVS engine, latest-updates listing, JSON-LD + video:duration meta, performers from models links with flashvars video_tags fallback for fresh uploads. Resolve via _kvs; final URL portable cross-IP. superporn.com rejected: Cloudflare 403 from VPS on all TLS impersonations. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
300 lines
12 KiB
Python
300 lines
12 KiB
Python
"""BasePlayTubeScraper — shared base for tube sites running the PlayTube CMS (hqfap, 4k69).

The platform is recognizable by: `/watch/<slug>_<id>.html` URLs, a sitemap index of
`/sitemaps/videos/sitemap-N.xml` files (with `<lastmod>`), a JSON-LD VideoObject on the
detail page (name + uploadDate + ISO 8601 duration + thumbnailUrl + contentUrl), and
`<a class='pill' href='/videos/pornstar|category/<Name>'>` pills.

The listing is NOT paginated via GET (PlayTube loads more through AJAX `aj/load-more/`),
so crawl_page builds the catalog from the sitemaps (sorted by lastmod desc = newest first)
and slices it into pages of `_PAGE_SIZE`. This works for browse_latest (pages 1-5) and
deep_crawl (cursor to the end of the catalog). Downside: the sitemap lags about a day
behind the freshest uploads — acceptable with a daily schedule.

A subclass sets `base_url` and (optionally) overrides `_pick_studio()` — PlayTube has no
structured studio field on a scene; the studio lives in the categories (hqfap: " Clips"
suffix; 4k69: name from the /studios list).

Cloudflare: HTML requires browser TLS (curl_cffi in browser_get); plain curl from the VPS
gets a 403. Sitemaps and thumbnails are served without a challenge.
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import date, datetime
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
)
|
|
from app.extractors import browser_get
|
|
from app.normalize.text import slugify
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_PAGE_SIZE = 20
|
|
|
|
_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
|
|
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
|
|
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")
|
|
_SCENE_ID_RE = re.compile(r"_(\d+)\.html")
|
|
|
|
_JSONLD_RE = re.compile(
|
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
|
|
re.IGNORECASE | re.DOTALL,
|
|
)
|
|
_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)
|
|
|
|
# Pille performera/kategorii. PlayTube renderuje single-quoted attrs; dopuszczamy
|
|
# oba quote'y. Nazwa z `<span itemprop='name'>` (href bywa URL-encoded / ze spacjami).
|
|
_PILL_RE = re.compile(
|
|
r"<a\s+class=['\"]pill['\"]\s+href=['\"]/videos/(?P<kind>pornstar|category)/[^'\"]*['\"]"
|
|
r".*?<span itemprop=['\"]name['\"]>(?P<name>[^<]+)</span>",
|
|
re.IGNORECASE | re.DOTALL,
|
|
)
|
|
|
|
|
|
def _parse_iso_duration(value: str | None) -> int | None:
    """Convert an ISO 8601 style duration such as ``PT26M48S`` to seconds.

    Args:
        value: Duration text as found in JSON-LD, or None.

    Returns:
        Total seconds, or None when the value is missing, is not a string,
        does not match ``_ISO_DUR_RE``, or adds up to zero.
    """
    # JSON-LD is site-controlled data: a number or list here must not raise
    # (an exception would discard the whole scene, not just its duration).
    if not isinstance(value, str) or not value:
        return None
    m = _ISO_DUR_RE.match(value.strip())
    if m is None:
        return None
    hours, minutes, seconds = (int(part or 0) for part in m.groups())
    total = hours * 3600 + minutes * 60 + seconds
    # A zero-length duration carries no information; report it as unknown.
    return total or None
|
|
|
|
|
|
def _parse_iso_date(value: str | None) -> date | None:
|
|
"""`2026-06-09T16:00:00+00:00` → date. None gdy parse fail."""
|
|
if not value:
|
|
return None
|
|
try:
|
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
|
except ValueError:
|
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
|
if m:
|
|
try:
|
|
return date.fromisoformat(m.group(1))
|
|
except ValueError:
|
|
return None
|
|
return None
|
|
|
|
|
|
def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD VideoObject node found in ``html``.

    Every ``application/ld+json`` script is tried in document order; scripts
    that are empty or not valid JSON are skipped. Within one script the
    top-level node(s) are checked first, then any nodes listed under a
    top-level ``@graph``. ``@type`` may be the string ``"VideoObject"`` or a
    list containing it.

    Args:
        html: Detail-page HTML.

    Returns:
        The matching node as a dict, or None when no script carries one.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for top in data if isinstance(data, list) else [data]:
            if not isinstance(top, dict):
                continue
            # Flat node first (the shape the original code handled), then
            # the members of an optional @graph wrapper.
            nodes = [top]
            graph = top.get("@graph")
            if isinstance(graph, list):
                nodes.extend(graph)
            for node in nodes:
                if not isinstance(node, dict):
                    continue
                node_type = node.get("@type")
                if node_type == "VideoObject" or (
                    isinstance(node_type, list) and "VideoObject" in node_type
                ):
                    return node
    return None
|
|
|
|
|
|
class BasePlayTubeScraper(BaseBrowseScraper):
    """Base scraper for PlayTube CMS sites, paginating over a sitemap catalog.

    A subclass sets ``base_url`` and may override ``_pick_studio()``.
    ``crawl_page`` is overridden so that a "page" is a ``_PAGE_SIZE`` slice of
    the scene-URL catalog built from the video sitemaps rather than an HTML
    listing page.
    """

    base_url: str  # e.g. "https://hqfap.com" — set by the subclass

    def __init__(self) -> None:
        super().__init__()
        # Scene-URL catalog from the sitemaps, newest first. Lazily initialised
        # once per instance (browse_latest and deep_crawl create one instance
        # per run, so the dozen or so XML fetches amortise over the whole run).
        self._catalog: list[str] | None = None

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Hook: pick the studio among the category display names, or None.

        The category chosen here is kept out of the scene's tags. The base
        implementation never picks a studio.
        """
        return None

    # crawl_page is overridden (like EpornerApiScraper): the listing cannot be
    # paginated via GET, so the sitemap is the pagination source.
    # _listing_url/_extract_scene_urls are unused but abstract, hence the
    # no-op implementations below.
    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
        return f"{self.base_url}/sitemap.xml"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        return []

    def _load_catalog(self) -> list[str] | None:
        """Return all scene URLs sorted by lastmod descending, building them once.

        The sitemap index at ``<base_url>/sitemap.xml`` is fetched, every
        listed URL containing ``/videos/sitemap-`` is downloaded, and each
        ``<url>`` entry whose ``<loc>`` contains ``/watch/`` is collected.
        The result is cached on the instance.

        Returns:
            The catalog, or None when the index cannot be fetched, lists no
            video sitemaps, or no scene URL could be collected. A None result
            is not cached, so the next call retries.
        """
        if self._catalog is not None:
            return self._catalog

        index_url = f"{self.base_url}/sitemap.xml"
        try:
            idx = browser_get(index_url, timeout=self._timeout)
            idx.raise_for_status()
        except Exception as e:
            log.warning("%s: sitemap index fetch failed: %s", self.sitetag, e)
            return None

        sitemap_urls = [
            u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
        ]
        if not sitemap_urls:
            log.warning("%s: sitemap index has no video sitemaps", self.sitetag)
            return None

        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
        for sm_url in sitemap_urls:
            try:
                sm = browser_get(sm_url, timeout=self._timeout)
                sm.raise_for_status()
            except Exception as e:
                # One missing sitemap is not a total failure — the rest of the
                # catalog is good enough.
                log.warning("%s: sitemap fetch failed %s: %s", self.sitetag, sm_url, e)
                continue
            for block in _URL_BLOCK_RE.findall(sm.text):
                loc_m = _SITEMAP_LOC_RE.search(block)
                if not loc_m or "/watch/" not in loc_m.group(1):
                    continue
                lastmod_m = _LASTMOD_RE.search(block)
                # Entries without <lastmod> get "" and therefore sort last.
                entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))

        if not entries:
            # Previously a silent None; make the cause visible in the logs.
            log.warning("%s: video sitemaps yielded no /watch/ URLs", self.sitetag)
            return None

        # Newest first; lastmod values are compared as plain strings.
        entries.sort(key=lambda e: e[0], reverse=True)

        # Dedup by scene id (a sitemap may repeat a URL across files); the
        # first, i.e. newest, occurrence wins. URLs without a recognisable id
        # are deduplicated by the full URL.
        seen_ids: set[str] = set()
        catalog: list[str] = []
        for _, url in entries:
            id_m = _SCENE_ID_RE.search(url)
            key = id_m.group(1) if id_m else url
            if key in seen_ids:
                continue
            seen_ids.add(key)
            catalog.append(url)

        log.info("%s: catalog loaded — %d scenes from %d sitemaps",
                 self.sitetag, len(catalog), len(sitemap_urls))
        self._catalog = catalog
        return catalog

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch and parse the scenes of one catalog page (1-based).

        Args:
            page: Page number; page ``n`` covers catalog entries
                ``(n - 1) * _PAGE_SIZE`` up to ``n * _PAGE_SIZE``.

        Returns:
            None when the catalog could not be loaded; an empty list when
            ``page`` lies outside the catalog (including ``page < 1``);
            otherwise the successfully parsed scenes. Detail pages that fail
            to download or parse are logged and skipped.
        """
        catalog = self._load_catalog()
        if catalog is None:
            return None
        if page < 1:
            # A negative slice start would wrap around and return scenes from
            # the tail of the catalog; treat it like any out-of-range page.
            return []
        start = (page - 1) * _PAGE_SIZE
        chunk = catalog[start:start + _PAGE_SIZE]
        if not chunk:
            return []

        out: list[RawScene] = []
        for scene_url in chunk:
            try:
                res = browser_get(scene_url, timeout=self._timeout)
                res.raise_for_status()
            except Exception as e:
                log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
                continue
            try:
                raw = self._parse_detail(scene_url, res.text)
            except Exception as e:
                log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
                continue
            if raw is not None:
                out.append(raw)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a detail page's JSON-LD and pills.

        Args:
            scene_url: URL of the detail page (also the source of the scene id).
            detail_html: HTML of that page.

        Returns:
            The scene, or None when the page has no JSON-LD VideoObject or the
            VideoObject has no name.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("%s: no JSON-LD VideoObject on %s", self.sitetag, scene_url)
            return None

        title = (video.get("name") or "").strip()
        if not title:
            return None

        id_m = _SCENE_ID_RE.search(scene_url)
        scene_id = id_m.group(1) if id_m else None

        duration_sec = _parse_iso_duration(video.get("duration"))
        release_date = _parse_iso_date(video.get("uploadDate"))

        # thumbnailUrl is normally a single URL string, but JSON-LD also allows
        # an array of URLs; use the first string so a list never reaches the
        # phash helper or the playback source.
        thumb = video.get("thumbnailUrl")
        if isinstance(thumb, list):
            thumb = next((t for t in thumb if isinstance(t, str) and t), None)
        thumbnail_url = thumb if isinstance(thumb, str) and thumb else None

        # Pills: pornstar -> performer; category -> studio (via the
        # `_pick_studio` hook) or tag. Censored names (`Te***`) are skipped —
        # asterisks are not data.
        performers: list[RawPerformer] = []
        category_names: list[str] = []
        seen_perf: set[str] = set()
        for m in _PILL_RE.finditer(detail_html):
            name = m.group("name").strip()
            if not name or "*" in name:
                continue
            if m.group("kind").lower() == "pornstar":
                slug = slugify(name)
                if slug and slug not in seen_perf:
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
                    )
            elif name not in category_names:
                category_names.append(name)

        studio: RawStudio | None = None
        studio_name = self._pick_studio(category_names)
        if studio_name:
            studio_slug = slugify(studio_name)
            studio = RawStudio(
                external_id=f"{self.sitetag}:studio:{studio_slug}",
                name=studio_name,
                slug=studio_slug,
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        picked = (studio_name or "").strip().lower()
        for name in category_names:
            # The studio category does not become a tag (nor does its variant
            # carrying the " Clips" suffix).
            if picked and name.strip().lower() in (picked, picked + " clips"):
                continue
            slug = slugify(name)
            if not slug or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Phash: thumbnails are sometimes re-encoded studio art (a chance of a
        # match); for amateur content they will not match. Graceful: a miss
        # falls back to composite scoring.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=self.base_url + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Stream: the JSON-LD contentUrl expires (time-bound token), so no
        # stream_url is stored; the per-sitetag extractor resolves a fresh one
        # on demand.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_id or scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|