diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index cb77395..6968a98 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -153,7 +153,9 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402 +from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402 from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402 +from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402 from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402 from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402 @@ -210,8 +212,18 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4 # (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`. HQFapScraper, - # 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML - # (probe 2026-06-01). Wymagałby headless render — odłożony. + # FourK69Scraper — dołączony 2026-06-10 (user request). Probe 2026-06-01 odrzucił + # po homepage "JS-rendered" — błędnie: scene pages mają pełny SSR + JSON-LD. Ta sama + # platforma PlayTube co hqfap (wspólna baza _playtube.py), ~65k scen, content głównie + # studyjny (4K paysite re-upload). Studio z kategorii matchowanych do listy /studios. + # Stream get_file (www.4kporno.xxx) jak fullmovies → mobile_direct, skip 2160p. + FourK69Scraper, + # NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/ + # porn00), /latest-updates/N/. 
JSON-LD (title+desc+uploadDate+thumb) + video:duration + # meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa + # "- HardX Update - ..." — fuzzy match po tytule). Resolve server-side _kvs, + # finalny remote_control.php portable cross-IP. + NepornScraper, # porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03). # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). ] diff --git a/app/connectors/direct_scrapers/_playtube.py b/app/connectors/direct_scrapers/_playtube.py new file mode 100644 index 0000000..832a4c2 --- /dev/null +++ b/app/connectors/direct_scrapers/_playtube.py @@ -0,0 +1,300 @@ +"""BasePlayTubeScraper — wspólna baza dla tube'ów na PlayTube CMS (hqfap, 4k69). + +Platforma rozpoznawalna po: `/watch/_.html`, sitemap index +`/sitemaps/videos/sitemap-N.xml` (z ``), JSON-LD VideoObject na detail +page'u (name + uploadDate + duration ISO 8601 + thumbnailUrl + contentUrl) oraz +pillach ``. + +Listing NIE paginuje się GET-em (PlayTube doładowuje AJAX-em `aj/load-more/`), +więc crawl_page buduje katalog z sitemapów (sort lastmod desc = newest first) +i tnie na strony po `_PAGE_SIZE`. Działa dla browse_latest (pages 1-5) i +deep_crawl (kursor do końca katalogu). Minus: sitemap laguje ~dobę za +najświeższymi uploadami — akceptowalne przy dziennym harmonogramie. + +Subclass ustawia `base_url` + (opcjonalnie) override'uje `_pick_studio()` — +PlayTube nie ma strukturalnego pola studio na scenie, studio siedzi w +kategoriach (hqfap: suffix " Clips"; 4k69: nazwa z listy /studios). + +Cloudflare: HTML wymaga browser TLS (curl_cffi w browser_get); plain curl z VPS +dostaje 403. Sitemapy i thumbnaile schodzą bez challenge'a. 
# ISO 8601 duration as emitted in schema.org `duration`. The leading "P"/"T"
# designators stay optional (the previous pattern tolerated their absence), a day
# component is accepted, and seconds may carry a decimal fraction. Years, months
# and weeks are not supported: they have no fixed length in seconds.
_ISO_DUR_RE = re.compile(
    r"^P?(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:[.,]\d+)?S)?$",
    re.IGNORECASE,
)


def _parse_iso_duration(value: str | None) -> int | None:
    """Convert an ISO 8601 duration such as `PT26M48S` to whole seconds.

    Accepts an optional day part (`P1DT2H`) and fractional seconds
    (`PT48.7S`, truncated to 48). Matching is case-insensitive.

    Returns None when the value is missing, is not a string, does not match the
    supported format, or adds up to zero seconds (a zero duration carries no
    signal for the caller).
    """
    if not value or not isinstance(value, str):
        return None
    m = _ISO_DUR_RE.match(value.strip())
    if not m:
        return None
    days, hours, minutes, seconds = (int(part or 0) for part in m.groups())
    total = days * 86400 + hours * 3600 + minutes * 60 + seconds
    return total or None
def _parse_iso_date(value: str | None) -> date | None:
    """Convert an ISO 8601 timestamp (`2026-06-09T16:00:00+00:00`) to a date.

    A trailing "Z" is rewritten to "+00:00" before parsing. When the full
    timestamp cannot be parsed, the leading `YYYY-MM-DD` prefix is tried on its
    own. Returns None for missing, non-string or unparseable input.
    """
    if not value or not isinstance(value, str):
        return None
    value = value.strip()
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
    except ValueError:
        pass
    m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
    if not m:
        return None
    try:
        return date.fromisoformat(m.group(1))
    except ValueError:
        return None


def _is_video_object(obj: object) -> bool:
    """True when `obj` is a JSON-LD node whose `@type` is (or includes) VideoObject."""
    if not isinstance(obj, dict):
        return False
    node_type = obj.get("@type")
    if isinstance(node_type, list):
        return "VideoObject" in node_type
    return node_type == "VideoObject"


def _first_url(value: object) -> str | None:
    """Reduce a JSON-LD URL-ish value to a single string.

    schema.org allows `thumbnailUrl` to be a string, a list of strings, or an
    object with a `url`; anything else yields None.
    """
    if isinstance(value, str):
        return value.strip() or None
    if isinstance(value, (list, tuple)):
        for item in value:
            url = _first_url(item)
            if url:
                return url
        return None
    if isinstance(value, dict):
        return _first_url(value.get("url") or value.get("contentUrl"))
    return None


def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD VideoObject found in `html`, or None.

    Every `application/ld+json` script block is tried in document order. A block
    may hold a single node, a list of nodes, or a node wrapping further nodes in
    `@graph`; blocks that are empty or not valid JSON are skipped.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for node in data if isinstance(data, list) else [data]:
            if not isinstance(node, dict):
                continue
            candidates = [node]
            graph = node.get("@graph")
            if isinstance(graph, list):
                candidates.extend(graph)
            for candidate in candidates:
                if _is_video_object(candidate):
                    return candidate
    return None


class BasePlayTubeScraper(BaseBrowseScraper):
    """Shared browse scraper for tubes running the PlayTube CMS.

    Pagination is driven by the video sitemaps rather than by listing pages:
    the catalog of scene URLs is loaded once per instance, ordered by `lastmod`
    descending, and cut into pages of `_PAGE_SIZE`. A subclass sets `base_url`
    and may override `_pick_studio()`.
    """

    base_url: str  # e.g. "https://hqfap.com" — set by the subclass

    def __init__(self) -> None:
        super().__init__()
        # Scene URLs from the sitemaps, newest first. Filled lazily on the first
        # successful `_load_catalog()` call and reused for the instance lifetime.
        self._catalog: list[str] | None = None

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Hook: pick the studio name among category display names, or None.

        The chosen category is kept out of the tag list by `_parse_detail`.
        The base implementation never picks a studio.
        """
        return None

    # `crawl_page` is overridden, so the two listing hooks below are not used by
    # this class; they are implemented only because the base declares them.
    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
        return f"{self.base_url}/sitemap.xml"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        return []

    def _load_catalog(self) -> list[str] | None:
        """Return every scene URL sorted by lastmod descending; None on failure.

        None is returned (and nothing is cached) when the sitemap index cannot be
        fetched, lists no video sitemaps, or no scene entry could be collected.
        A single failing video sitemap is logged and skipped.
        """
        if self._catalog is not None:
            return self._catalog
        index_url = f"{self.base_url}/sitemap.xml"
        try:
            idx = browser_get(index_url, timeout=self._timeout)
            idx.raise_for_status()
        except Exception as e:
            log.warning("%s: sitemap index fetch failed: %s", self.sitetag, e)
            return None
        sitemap_urls = [
            u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
        ]
        if not sitemap_urls:
            log.warning("%s: sitemap index has no video sitemaps", self.sitetag)
            return None

        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
        for sm_url in sitemap_urls:
            try:
                sm = browser_get(sm_url, timeout=self._timeout)
                sm.raise_for_status()
            except Exception as e:
                # One missing sitemap does not fail the whole catalog.
                log.warning("%s: sitemap fetch failed %s: %s", self.sitetag, sm_url, e)
                continue
            for block in _URL_BLOCK_RE.findall(sm.text):
                loc_m = _SITEMAP_LOC_RE.search(block)
                if not loc_m or "/watch/" not in loc_m.group(1):
                    continue
                lastmod_m = _LASTMOD_RE.search(block)
                entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))

        if not entries:
            return None
        # Newest first; entries without a lastmod (empty string) sort last.
        entries.sort(key=lambda e: e[0], reverse=True)
        # De-duplicate by scene id (falling back to the URL), keeping the first,
        # i.e. most recently modified, occurrence.
        seen_ids: set[str] = set()
        catalog: list[str] = []
        for _, url in entries:
            id_m = _SCENE_ID_RE.search(url)
            key = id_m.group(1) if id_m else url
            if key in seen_ids:
                continue
            seen_ids.add(key)
            catalog.append(url)
        log.info("%s: catalog loaded — %d scenes from %d sitemaps",
                 self.sitetag, len(catalog), len(sitemap_urls))
        self._catalog = catalog
        return catalog

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch and parse the scenes of catalog page `page` (1-based).

        Returns None when the catalog could not be loaded, an empty list when the
        page lies outside the catalog, otherwise the scenes that were fetched and
        parsed successfully (failures are logged and skipped).
        """
        catalog = self._load_catalog()
        if catalog is None:
            return None
        start = (page - 1) * _PAGE_SIZE
        chunk = catalog[start:start + _PAGE_SIZE]
        if not chunk:
            return []
        out: list[RawScene] = []
        for scene_url in chunk:
            try:
                res = browser_get(scene_url, timeout=self._timeout)
                res.raise_for_status()
            except Exception as e:
                log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
                continue
            try:
                raw = self._parse_detail(scene_url, res.text)
            except Exception as e:
                log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
                continue
            if raw is not None:
                out.append(raw)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a detail page; None without a usable VideoObject/title."""
        video = _extract_video_object(detail_html)
        if not video:
            log.info("%s: no JSON-LD VideoObject on %s", self.sitetag, scene_url)
            return None

        name_field = video.get("name")
        title = name_field.strip() if isinstance(name_field, str) else ""
        if not title:
            return None

        id_m = _SCENE_ID_RE.search(scene_url)
        scene_id = id_m.group(1) if id_m else None

        duration_sec = _parse_iso_duration(video.get("duration"))
        release_date = _parse_iso_date(video.get("uploadDate"))
        # thumbnailUrl may legally be a list or an object; keep one plain URL.
        thumbnail_url = _first_url(video.get("thumbnailUrl"))

        # Pills: pornstar -> performer; category -> studio (via `_pick_studio`)
        # or tag. Names containing "*" are censored by the site and are skipped.
        performers: list[RawPerformer] = []
        category_names: list[str] = []
        seen_perf: set[str] = set()
        for m in _PILL_RE.finditer(detail_html):
            name = m.group("name").strip()
            if not name or "*" in name:
                continue
            if m.group("kind").lower() == "pornstar":
                slug = slugify(name)
                if slug and slug not in seen_perf:
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
                    )
            elif name not in category_names:
                category_names.append(name)

        studio: RawStudio | None = None
        studio_name = self._pick_studio(category_names)
        if studio_name:
            studio_slug = slugify(studio_name)
            studio = RawStudio(
                external_id=f"{self.sitetag}:studio:{studio_slug}",
                name=studio_name,
                slug=studio_slug,
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        picked = (studio_name or "").strip().lower()
        for name in category_names:
            # The studio category is not a tag, with or without a " Clips" suffix.
            if picked and name.strip().lower() in (picked, picked + " clips"):
                continue
            slug = slugify(name)
            if not slug or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Perceptual hash of the thumbnail is best-effort: no hash, no fingerprint.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=self.base_url + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # The JSON-LD contentUrl is deliberately not stored as a stream URL; only
        # the page URL is kept so a stream can be resolved later on demand.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_id or scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
_STUDIO_LINK_RE = re.compile(r"href=['\"][^'\"]*/videos/studio/([^'\"]+)['\"]", re.IGNORECASE)


def _norm(name: str) -> str:
    """Lower-case `name` and drop everything except a-z and 0-9.

    `Adult Time` and `AdultTime` both become `adulttime`, which lets a category
    pill be compared with a studio slug.
    """
    return re.sub(r"[^a-z0-9]", "", name.lower())


class FourK69Scraper(BasePlayTubeScraper):
    """4k69.com scraper: studio is the first category found on the /studios list."""

    sitetag = "4k69com"
    base_url = "https://4k69.com"

    def __init__(self) -> None:
        super().__init__()
        # Normalized studio names from /studios; None until the first load attempt.
        self._studio_set: set[str] | None = None

    def _load_studio_set(self) -> set[str]:
        """Return the normalized names of all studios linked from /studios.

        The page is requested once per instance. On any failure the result is an
        empty set (also cached), so scenes are simply emitted without a studio.
        """
        if self._studio_set is not None:
            return self._studio_set
        from urllib.parse import unquote

        studios: set[str] = set()
        try:
            r = browser_get(f"{self.base_url}/studios", timeout=self._timeout)
            r.raise_for_status()
            for raw_slug in _STUDIO_LINK_RE.findall(r.text):
                # Keep only the studio path segment, then URL-decode it: without
                # decoding, "Adult%20Time" would normalize to "adult20time" and
                # never match the "Adult Time" pill.
                segment = re.split(r"[/?#]", raw_slug, maxsplit=1)[0]
                key = _norm(unquote(segment))
                if key:
                    studios.add(key)
            log.info("4k69: studio list loaded — %d studios", len(studios))
        except Exception as e:
            log.warning("4k69: studios page fetch failed: %s", e)
            studios = set()
        self._studio_set = studios
        return studios

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Return the first category whose normalized name is a known studio."""
        studios = self._load_studio_set()
        if not studios:
            return None
        for name in category_names:
            if _norm(name) in studios:
                return name
        return None
Re-uploader katalogu pornhd.pet (~120k scen, thumbnaile to base64-encoded oryginalne URL-e w `/uploads/images/`). -Sygnały per scena (wszystko w SSR HTML detail page'a): - - JSON-LD VideoObject: name, uploadDate (ISO), duration (ISO 8601 `PT26M48S`), - thumbnailUrl, contentUrl (direct mp4 — patrz extractor `hqfapcom`) - - Performerzy: blok "Pornstars:" — `` - - Kategorie: blok "Categories & Tags:" — `` - Część nazw ocenzurowana gwiazdkami (`Te***`) — pomijamy. Kategorie z suffixem - " Clips" to studia ("Filthy Kings Clips") → RawStudio. - -Listing: strona główna i `/videos/latest` NIE paginują się przez GET (PlayTube -doładowuje AJAX-em), ale site ma pełny **sitemap index** (`/sitemap.xml` → -12× `sitemaps/videos/sitemap-N.xml`, po ~10k URL-i z ``). Crawl_page -buduje katalog z sitemap (sort lastmod desc = newest first) i tnie na strony po -20 URL-i — działa i dla browse_latest (pages 1-5) i dla deep_crawl (kursor). - -Cloudflare: strony HTML wymagają browser TLS (curl_cffi w browser_get); plain -curl z VPS dostaje 403. Sitemap i thumbnaile schodzą bez challenge'a. +Specyfika vs baza: studio siedzi w kategoriach z suffixem " Clips" +("Filthy Kings Clips" → studio "Filthy Kings"); reszta kategorii → tagi. +Playback: direct mp4 z JSON-LD contentUrl (cdnde.com nowsze / okcdn.ru starsze), +tokeny time-bound i portable cross-IP → natywny extractor `hqfapcom`. 
""" from __future__ import annotations -import json -import logging -import re -from datetime import date, datetime - -from app.connectors.base import ( - RawFingerprint, - RawPerformer, - RawPlaybackSource, - RawScene, - RawStudio, - RawTag, -) -from app.connectors.direct_scrapers._browse_base import ( - BaseBrowseScraper, - compute_thumbnail_phash, -) -from app.extractors import browser_get -from app.normalize.text import slugify - -log = logging.getLogger(__name__) - -_BASE = "https://hqfap.com" -_SITEMAP_INDEX = f"{_BASE}/sitemap.xml" -_PAGE_SIZE = 20 - -_SITEMAP_LOC_RE = re.compile(r"\s*([^<]+?)\s*") -_URL_BLOCK_RE = re.compile(r"(.*?)", re.DOTALL | re.IGNORECASE) -_LASTMOD_RE = re.compile(r"\s*([^<]+?)\s*") -_SCENE_ID_RE = re.compile(r"_(\d+)\.html") - -_JSONLD_RE = re.compile( - r']+type=["\']application/ld\+json["\'][^>]*>(.*?)', - re.IGNORECASE | re.DOTALL, -) -_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE) - -# Pille performera/kategorii. PlayTube renderuje single-quoted attrs; dopuszczamy -# oba quote'y. Nazwa z `` (href bywa URL-encoded / ze spacjami). -_PILL_RE = re.compile( - r"pornstar|category)/[^'\"]*['\"]" - r".*?(?P[^<]+)", - re.IGNORECASE | re.DOTALL, -) +from app.connectors.direct_scrapers._playtube import BasePlayTubeScraper -def _parse_iso_duration(value: str | None) -> int | None: - """`PT26M48S` → sekundy. None gdy format nieznany.""" - if not value: - return None - m = _ISO_DUR_RE.match(value.strip()) - if not m: - return None - total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0) - return total or None - - -def _parse_iso_date(value: str | None) -> date | None: - """`2026-06-09T16:00:00+00:00` → date. 
class HQFapScraper(BasePlayTubeScraper):
    """hqfap.com scraper: the studio is the category carrying a " Clips" suffix."""

    sitetag = "hqfapcom"
    base_url = "https://hqfap.com"

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Return the first non-empty category name with its " Clips" suffix removed.

        The suffix check is case-insensitive; None when no category qualifies.
        """
        suffix = " clips"
        stripped = (
            label[: -len(suffix)].strip()
            for label in category_names
            if label.lower().endswith(suffix)
        )
        return next((candidate for candidate in stripped if candidate), None)
+Playback: KVS function/0 + license → `_kvs.resolve_kvs` server-side; finalny +`data001.neporn.com/remote_control.php?time=...` portable cross-IP +(test 2026-06-10: VPS resolve → lokalny ISP 206 video/mp4). +""" +from __future__ import annotations + +import logging +import re + +from app.connectors.base import ( + RawFingerprint, + RawPerformer, + RawPlaybackSource, + RawScene, + RawTag, +) +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, + meta_content, +) +from app.connectors.direct_scrapers._playtube import ( + _extract_video_object, + _parse_iso_date, +) + +log = logging.getLogger(__name__) + +_BASE = "https://neporn.com" + +_SCENE_URL_RE = re.compile(r'href="(https://neporn\.com/video/(\d+)/[^"]+)"') +_MODEL_LINK_RE = re.compile(r'href="https://neporn\.com/models/([a-z0-9\-]+)/"', re.IGNORECASE) +_CATEGORY_LINK_RE = re.compile( + r'href="https://neporn\.com/categories/([a-z0-9\-]+)/"\s*>\s*([^<]+?)\s*<', re.IGNORECASE +) +_VIDEO_TAGS_RE = re.compile(r"video_tags:\s*'([^']*)'") + +# Słowa dyskwalifikujące wpis z video_tags jako nazwisko (generyczne frazy typu +# "deep throat" / "natural tits" przechodzą test "jest w tytule" zbyt często). 
# Words that disqualify a `video_tags` entry from being treated as a performer
# name: generic phrases pass the "appears in the title" test too often.
_PERF_STOPWORDS = frozenset(
    "porn sex tits ass anal throat cum blow blowjob dick cock pussy fuck fucking "
    "scene scenes hd milf teen big small double penetration facial creampie "
    "threesome amateur petite latina blonde brunette".split()
)


def _name_from_slug(slug: str) -> str:
    """Turn a dash-separated slug into a display name: `emily-willis` -> `Emily Willis`."""
    return " ".join(w.capitalize() for w in slug.split("-") if w)


class NepornScraper(BaseBrowseScraper):
    """neporn.com scraper built on the paginated `/latest-updates/N/` listing."""

    sitetag = "neporncom"

    def _listing_url(self, page: int) -> str:
        """URL of listing page `page`; page 1 (or lower) is the unnumbered listing."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Scene URLs found in the listing HTML, de-duplicated, in document order."""
        seen: set[str] = set()
        out: list[str] = []
        for m in _SCENE_URL_RE.finditer(listing_html):
            url = m.group(1)
            if url in seen:
                continue
            seen.add(url)
            out.append(url)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a detail page; None when no title can be found."""
        video = _extract_video_object(detail_html) or {}

        title = (video.get("name") or meta_content(detail_html, property="og:title") or "").strip()
        # Titles may start with a dangling separator ("- HardX Update - ...").
        title = title.lstrip("- ").strip()
        if not title:
            return None

        id_m = re.search(r"/video/(\d+)/", scene_url)
        video_id = id_m.group(1) if id_m else None

        duration_sec: int | None = None
        dur_meta = (meta_content(detail_html, property="video:duration") or "").strip()
        # isdecimal() rather than isdigit(): the latter also accepts characters
        # such as superscript digits that int() rejects with ValueError.
        if dur_meta.isdecimal():
            duration_sec = int(dur_meta)

        release_date = _parse_iso_date(video.get("uploadDate"))
        description = (video.get("description") or "").strip() or None
        thumbnail_url = (video.get("thumbnailUrl") or "").strip() or None
        if thumbnail_url:
            # Drop a duplicated scheme prefix ("https:https://cdn...").
            thumbnail_url = re.sub(r"^https?:(?=https?://)", "", thumbnail_url)

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            # The link pattern is case-insensitive, so fold the slug before using
            # it as a dedup key and inside the external id.
            slug = m.group(1).lower()
            if slug in seen_perf:
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:model:{slug}",
                    name=_name_from_slug(slug),
                )
            )
        # Extra names from the flashvars `video_tags` list: entries of 2-3 words,
        # present in the title, containing no stop word.
        # NOTE(review): this runs even when model links were found above; if it is
        # meant purely as a fallback, guard it with `if not performers` — confirm.
        title_cf = title.casefold()
        tags_m = _VIDEO_TAGS_RE.search(detail_html)
        for entry in (tags_m.group(1).split(",") if tags_m else []):
            words = entry.split()
            if not (2 <= len(words) <= 3):
                continue
            if any(w.casefold() in _PERF_STOPWORDS for w in words):
                continue
            # Re-join on single spaces so doubled whitespace inside an entry can
            # neither defeat the title check nor leak into the slug as "--".
            phrase = " ".join(words).casefold()
            if phrase not in title_cf:
                continue
            slug = phrase.replace(" ", "-")
            if slug in seen_perf:
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:model:{slug}",
                    name=_name_from_slug(slug),
                )
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _CATEGORY_LINK_RE.finditer(detail_html):
            slug, name = m.group(1).lower(), m.group(2).strip()
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Perceptual hash of the thumbnail is best-effort: no hash, no fingerprint.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{video_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Collect the playable ``get_file`` MP4 streams from a 4k69.com scene page.

    The whole page is scanned with ``_GET_FILE_RE`` (the URLs live in the
    JSON-LD ``contentUrl`` and in the player JS, not in dedicated tags).
    URLs are returned unresolved and flagged ``mobile_direct_ok`` so the
    client follows the redirect itself.

    Args:
        page_url: URL of the scene page to scan.
        timeout: Forwarded unchanged to ``fetch_tube_html``.

    Returns:
        Stream sources ordered by numeric quality, highest first, or ``None``
        when the page yields no usable ``get_file`` URL.
    """
    html = fetch_tube_html(page_url, timeout=timeout)

    seen: set[str] = set()
    ranked: list[tuple[int, StreamSource]] = []
    # NOTE(review): fetch_tube_html is defined elsewhere; `or ""` makes a
    # None/empty page take the ordinary "nothing found" path instead of
    # raising TypeError inside finditer — confirm the fetcher's contract.
    for match in _GET_FILE_RE.finditer(html or ""):
        url = match.group(0)
        # The pattern allows an optional trailing slash, so the same file may
        # show up in both forms; de-duplicate on the slash-less spelling.
        dedup_key = url.rstrip("/")
        if dedup_key in seen:
            continue
        seen.add(dedup_key)

        quality_match = _QUALITY_RE.search(url)
        if quality_match is None:
            # `_preview.mp4` and similar carry no quality number — skip
            # (trailer, not the scene).
            continue
        quality_num = quality_match.group(1)
        if _SKIP_QUALITY_RE.match(quality_num):
            # 2160p/1440p are dropped on purpose (CDN time-outs, see module
            # docstring).
            continue

        ranked.append((
            int(quality_num),
            StreamSource(
                link=url,
                quality=f"{quality_num}p",
                type="mp4",
                referer="https://4k69.com/",
                raw={"mobile_direct_ok": True},
            ),
        ))

    if not ranked:
        log.info("4k69: no get_file URLs on %s", page_url)
        return None
    # Stable sort: sources of equal quality keep their page order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [source for _, source in ranked]