feat(tubes): add 4k69 + neporn browse scrapers, shared PlayTube base
4k69.com (~65k scenes): same PlayTube CMS as hqfap - common logic moved to _playtube.py (sitemap catalog, JSON-LD, pills). Studio classified by matching category pills against the studios index page. Streams are get_file (fullmovies family) returned unresolved with mobile_direct, 2160p skipped. neporn.com: KVS engine, latest-updates listing, JSON-LD + video:duration meta, performers from models links with flashvars video_tags fallback for fresh uploads. Resolve via _kvs; final URL portable cross-IP. superporn.com rejected: Cloudflare 403 from VPS on all TLS impersonations. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
6de986b9a7
commit
80fd83cb4e
8 changed files with 670 additions and 285 deletions
|
|
@ -153,7 +153,9 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F
|
||||||
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402
|
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
||||||
|
|
||||||
|
|
@ -210,8 +212,18 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4
|
# się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4
|
||||||
# (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`.
|
# (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`.
|
||||||
HQFapScraper,
|
HQFapScraper,
|
||||||
# 4k69.com — NIE dołączony: homepage JS-rendered, brak og:/KVS markerów w surowym HTML
|
# FourK69Scraper — dołączony 2026-06-10 (user request). Probe 2026-06-01 odrzucił
|
||||||
# (probe 2026-06-01). Wymagałby headless render — odłożony.
|
# po homepage "JS-rendered" — błędnie: scene pages mają pełny SSR + JSON-LD. Ta sama
|
||||||
|
# platforma PlayTube co hqfap (wspólna baza _playtube.py), ~65k scen, content głównie
|
||||||
|
# studyjny (4K paysite re-upload). Studio z kategorii matchowanych do listy /studios.
|
||||||
|
# Stream get_file (www.4kporno.xxx) jak fullmovies → mobile_direct, skip 2160p.
|
||||||
|
FourK69Scraper,
|
||||||
|
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
|
||||||
|
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
|
||||||
|
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa
|
||||||
|
# "- HardX Update - ..." — fuzzy match po tytule). Resolve server-side _kvs,
|
||||||
|
# finalny remote_control.php portable cross-IP.
|
||||||
|
NepornScraper,
|
||||||
# porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
|
# porntrex/hqporner/youporn — NIE: KVS/JS bez SSR duration → niewidoczne orphany (2026-06-03).
|
||||||
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
||||||
]
|
]
|
||||||
|
|
|
||||||
300
app/connectors/direct_scrapers/_playtube.py
Normal file
300
app/connectors/direct_scrapers/_playtube.py
Normal file
|
|
@ -0,0 +1,300 @@
|
||||||
|
"""BasePlayTubeScraper — wspólna baza dla tube'ów na PlayTube CMS (hqfap, 4k69).
|
||||||
|
|
||||||
|
Platforma rozpoznawalna po: `/watch/<slug>_<id>.html`, sitemap index
|
||||||
|
`/sitemaps/videos/sitemap-N.xml` (z `<lastmod>`), JSON-LD VideoObject na detail
|
||||||
|
page'u (name + uploadDate + duration ISO 8601 + thumbnailUrl + contentUrl) oraz
|
||||||
|
pillach `<a class='pill' href='/videos/pornstar|category/<Name>'>`.
|
||||||
|
|
||||||
|
Listing NIE paginuje się GET-em (PlayTube doładowuje AJAX-em `aj/load-more/`),
|
||||||
|
więc crawl_page buduje katalog z sitemapów (sort lastmod desc = newest first)
|
||||||
|
i tnie na strony po `_PAGE_SIZE`. Działa dla browse_latest (pages 1-5) i
|
||||||
|
deep_crawl (kursor do końca katalogu). Minus: sitemap laguje ~dobę za
|
||||||
|
najświeższymi uploadami — akceptowalne przy dziennym harmonogramie.
|
||||||
|
|
||||||
|
Subclass ustawia `base_url` + (opcjonalnie) override'uje `_pick_studio()` —
|
||||||
|
PlayTube nie ma strukturalnego pola studio na scenie, studio siedzi w
|
||||||
|
kategoriach (hqfap: suffix " Clips"; 4k69: nazwa z listy /studios).
|
||||||
|
|
||||||
|
Cloudflare: HTML wymaga browser TLS (curl_cffi w browser_get); plain curl z VPS
|
||||||
|
dostaje 403. Sitemapy i thumbnaile schodzą bez challenge'a.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PAGE_SIZE = 20
|
||||||
|
|
||||||
|
_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
|
||||||
|
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
|
||||||
|
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")
|
||||||
|
_SCENE_ID_RE = re.compile(r"_(\d+)\.html")
|
||||||
|
|
||||||
|
_JSONLD_RE = re.compile(
|
||||||
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
|
||||||
|
re.IGNORECASE | re.DOTALL,
|
||||||
|
)
|
||||||
|
_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)
|
||||||
|
|
||||||
|
# Pille performera/kategorii. PlayTube renderuje single-quoted attrs; dopuszczamy
|
||||||
|
# oba quote'y. Nazwa z `<span itemprop='name'>` (href bywa URL-encoded / ze spacjami).
|
||||||
|
_PILL_RE = re.compile(
|
||||||
|
r"<a\s+class=['\"]pill['\"]\s+href=['\"]/videos/(?P<kind>pornstar|category)/[^'\"]*['\"]"
|
||||||
|
r".*?<span itemprop=['\"]name['\"]>(?P<name>[^<]+)</span>",
|
||||||
|
re.IGNORECASE | re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_duration(value: str | None) -> int | None:
|
||||||
|
"""`PT26M48S` → sekundy. None gdy format nieznany."""
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
m = _ISO_DUR_RE.match(value.strip())
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
||||||
|
return total or None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_date(value: str | None) -> date | None:
|
||||||
|
"""`2026-06-09T16:00:00+00:00` → date. None gdy parse fail."""
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_jsonld_nodes(data):
    """Yield every dict node of a decoded JSON-LD document, in document order.

    Descends into top-level lists and into `@graph` containers, so a
    VideoObject is found whether it is emitted flat, inside a list, or
    wrapped in a graph.
    """
    if isinstance(data, list):
        for item in data:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(data, dict):
        yield data
        yield from _iter_jsonld_nodes(data.get("@graph"))


def _is_video_object(node: dict) -> bool:
    """Tell whether a JSON-LD node is typed as `VideoObject`.

    JSON-LD allows `@type` to be either a single string or a list of types.
    """
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "VideoObject" in node_type
    return node_type == "VideoObject"


def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD VideoObject found in a page, or None.

    Scans every `<script type="application/ld+json">` block matched by
    `_JSONLD_RE`. Empty or undecodable blocks are skipped rather than treated
    as errors, so one broken block does not hide a valid one later on.

    Args:
        html: Full HTML source of a scene detail page.

    Returns:
        The decoded VideoObject dict, or None when the page has none.
    """
    for block in _JSONLD_RE.finditer(html):
        payload = block.group(1).strip()
        if not payload:
            continue
        try:
            data = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for node in _iter_jsonld_nodes(data):
            if _is_video_object(node):
                return node
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class BasePlayTubeScraper(BaseBrowseScraper):
    """Sitemap-driven browse scraper shared by PlayTube CMS sites.

    Pagination is synthetic: the scene catalog is built once from the video
    sitemaps (newest `lastmod` first) and sliced into pages of `_PAGE_SIZE`
    URLs. Each scene is then parsed from its detail page (JSON-LD VideoObject
    plus performer/category pills).

    Subclasses set `base_url` and may override `_pick_studio()` to promote one
    of the scene's categories to a studio.
    """

    base_url: str  # e.g. "https://hqfap.com" — set by the subclass

    def __init__(self) -> None:
        super().__init__()
        # Scene-URL catalog from the sitemaps, newest first. Loaded lazily and
        # cached on the instance so the sitemap fetches happen once, not once
        # per crawled page.
        self._catalog: list[str] | None = None

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Hook: choose the studio among the category display names.

        The base implementation selects nothing. When a subclass returns a
        name, the matching category is kept out of the scene's tags.

        Args:
            category_names: Category pill names found on the detail page.

        Returns:
            The studio display name, or None when no category is a studio.
        """
        return None

    # `crawl_page` is overridden because the listing cannot be paginated with
    # plain GET requests; the sitemap is the pagination source. The two
    # methods below are therefore unused, but (per the original note) they are
    # abstract in the base class, so no-op implementations are provided.
    def _listing_url(self, page: int) -> str:  # pragma: no cover - unused
        """Return the sitemap index URL (placeholder, never fetched as a listing)."""
        return f"{self.base_url}/sitemap.xml"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:  # pragma: no cover
        """Return no URLs (placeholder; scene URLs come from `_load_catalog`)."""
        return []

    def _load_catalog(self) -> list[str] | None:
        """Build (or return the cached) list of scene URLs, newest first.

        Returns:
            Scene URLs sorted by sitemap `lastmod` descending and deduplicated
            by scene id. None when the sitemap index cannot be fetched, lists
            no video sitemaps, or no `/watch/` URL was collected at all.
        """
        if self._catalog is not None:
            return self._catalog
        index_url = f"{self.base_url}/sitemap.xml"
        try:
            idx = browser_get(index_url, timeout=self._timeout)
            idx.raise_for_status()
        except Exception as e:
            log.warning("%s: sitemap index fetch failed: %s", self.sitetag, e)
            return None
        sitemap_urls = [
            u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
        ]
        if not sitemap_urls:
            log.warning("%s: sitemap index has no video sitemaps", self.sitetag)
            return None

        entries: list[tuple[str, str]] = []  # (lastmod, scene_url)
        for sm_url in sitemap_urls:
            try:
                sm = browser_get(sm_url, timeout=self._timeout)
                sm.raise_for_status()
            except Exception as e:
                # One missing sitemap must not fail the whole run; the rest of
                # the catalog is still usable.
                log.warning("%s: sitemap fetch failed %s: %s", self.sitetag, sm_url, e)
                continue
            for block in _URL_BLOCK_RE.findall(sm.text):
                loc_m = _SITEMAP_LOC_RE.search(block)
                if not loc_m or "/watch/" not in loc_m.group(1):
                    continue
                lastmod_m = _LASTMOD_RE.search(block)
                # Entries without <lastmod> get "" and therefore sort last.
                entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))

        if not entries:
            return None
        entries.sort(key=lambda e: e[0], reverse=True)
        # Deduplicate by scene id: a sitemap may repeat a URL across files.
        seen_ids: set[str] = set()
        catalog: list[str] = []
        for _, url in entries:
            id_m = _SCENE_ID_RE.search(url)
            key = id_m.group(1) if id_m else url
            if key in seen_ids:
                continue
            seen_ids.add(key)
            catalog.append(url)
        log.info("%s: catalog loaded — %d scenes from %d sitemaps",
                 self.sitetag, len(catalog), len(sitemap_urls))
        self._catalog = catalog
        return catalog

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch and parse one synthetic page of the catalog.

        Args:
            page: 1-based page number.

        Returns:
            Parsed scenes of that page (scenes whose fetch or parse failed are
            skipped), an empty list when the page lies outside the catalog,
            or None when the catalog could not be loaded.
        """
        catalog = self._load_catalog()
        if catalog is None:
            return None
        if page < 1:
            # A negative slice start would wrap around to the catalog's tail.
            return []
        start = (page - 1) * _PAGE_SIZE
        chunk = catalog[start:start + _PAGE_SIZE]
        if not chunk:
            return []
        out: list[RawScene] = []
        for scene_url in chunk:
            try:
                res = browser_get(scene_url, timeout=self._timeout)
                res.raise_for_status()
            except Exception as e:
                log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
                continue
            try:
                raw = self._parse_detail(scene_url, res.text)
            except Exception as e:
                log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
                continue
            if raw is not None:
                out.append(raw)
        return out

    @staticmethod
    def _first_url(value: object) -> str | None:
        """Reduce a JSON-LD URL-ish value to a single URL string.

        schema.org allows `thumbnailUrl` to be a string, a list of strings or
        an ImageObject-like dict with a `url` key; return the first usable
        string, or None when there is none.
        """
        candidates = value if isinstance(value, list) else [value]
        for item in candidates:
            if isinstance(item, dict):
                item = item.get("url")
            if isinstance(item, str) and item.strip():
                return item.strip()
        return None

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a scene detail page.

        Args:
            scene_url: Canonical URL of the scene (also the source of its id).
            detail_html: HTML source of the detail page.

        Returns:
            The parsed scene, or None when the page has no JSON-LD VideoObject
            or the VideoObject has no usable title.
        """
        # Local import: pill names are HTML text and must be entity-decoded.
        from html import unescape

        video = _extract_video_object(detail_html)
        if not video:
            log.info("%s: no JSON-LD VideoObject on %s", self.sitetag, scene_url)
            return None

        raw_title = video.get("name")
        title = raw_title.strip() if isinstance(raw_title, str) else ""
        if not title:
            return None

        id_m = _SCENE_ID_RE.search(scene_url)
        scene_id = id_m.group(1) if id_m else None

        duration_sec = _parse_iso_duration(video.get("duration"))
        release_date = _parse_iso_date(video.get("uploadDate"))
        thumbnail_url = self._first_url(video.get("thumbnailUrl"))

        # Pills: pornstar -> performer; category -> studio (via the
        # `_pick_studio` hook) or tag. Censored names (`Te***`) are skipped —
        # asterisks are not data.
        performers: list[RawPerformer] = []
        category_names: list[str] = []
        seen_perf: set[str] = set()
        for m in _PILL_RE.finditer(detail_html):
            # Decode entities so "A &amp; B" becomes "A & B" before slugging.
            name = unescape(m.group("name")).strip()
            if not name or "*" in name:
                continue
            if m.group("kind").lower() == "pornstar":
                slug = slugify(name)
                if slug and slug not in seen_perf:
                    seen_perf.add(slug)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{slug}", name=name)
                    )
            elif name not in category_names:
                category_names.append(name)

        studio: RawStudio | None = None
        studio_name = self._pick_studio(category_names)
        if studio_name:
            studio_slug = slugify(studio_name)
            studio = RawStudio(
                external_id=f"{self.sitetag}:studio:{studio_slug}",
                name=studio_name,
                slug=studio_slug,
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        picked = (studio_name or "").strip().lower()
        for name in category_names:
            # The studio category never becomes a tag (neither bare nor in its
            # " Clips"-suffixed form).
            if picked and name.strip().lower() in (picked, picked + " clips"):
                continue
            slug = slugify(name)
            if not slug or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Perceptual hash of the thumbnail: best effort, a miss simply leaves
        # the scene without a fingerprint.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=self.base_url + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Stream: per the original note the JSON-LD contentUrl is time-bound,
        # so no stream URL is stored here; only the page URL is recorded and
        # the per-site extractor resolves a fresh stream on demand.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_id or scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
66
app/connectors/direct_scrapers/fourk69.py
Normal file
66
app/connectors/direct_scrapers/fourk69.py
Normal file
|
|
@ -0,0 +1,66 @@
|
||||||
|
"""4k69.com — latest-vids browse scraper (PlayTube CMS, patrz _playtube.py).
|
||||||
|
|
||||||
|
Dołączony 2026-06-10 (user request; probe 2026-06-01 odrzucił po stronie głównej
|
||||||
|
"JS-rendered" — błędnie, scene pages mają pełny SSR + JSON-LD). 7 video sitemapów
|
||||||
|
≈ ~65k scen, content w dużej mierze studyjny (paysite re-upload, 4K).
|
||||||
|
|
||||||
|
Specyfika vs baza: studio NIE ma własnego pola na scenie — nazwy studiów występują
|
||||||
|
jako kategorie ("21 Sextury", "Adult Time") obok zwykłych ("Anal", "4K").
|
||||||
|
Klasyfikacja: lista wszystkich studiów z `/studios` (fetch raz per instancję,
|
||||||
|
match po znormalizowanej nazwie alfanumerycznej — pill "Adult Time" vs slug
|
||||||
|
"AdultTime"). Studio bywa też w prefiksie tytułu, ale kategoria jest pewniejsza.
|
||||||
|
|
||||||
|
Playback: JSON-LD contentUrl + dwa dodatkowe get_file w HTML (2160m/720m/480m,
|
||||||
|
www.4kporno.xxx) — ta sama platforma co fullmovies/hdporngg: get_file binduje CDN
|
||||||
|
do IP fetchera, więc oddajemy NIEZRESOLWOWANE (mobile_direct), telefon follow-uje
|
||||||
|
302 z własnym IP. Extractor `4k69com` pomija 2160p (CDN time-out, jak fpvcdn).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._playtube import BasePlayTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_STUDIO_LINK_RE = re.compile(r"href=['\"][^'\"]*/videos/studio/([^'\"]+)['\"]", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _norm(name: str) -> str:
|
||||||
|
"""`Adult Time` / `AdultTime` → `adulttime` (porównanie pill vs studio slug)."""
|
||||||
|
return re.sub(r"[^a-z0-9]", "", name.lower())
|
||||||
|
|
||||||
|
|
||||||
|
class FourK69Scraper(BasePlayTubeScraper):
    """4k69.com scraper: PlayTube base plus studio detection via `/studios`.

    A scene's studio is the first category pill whose normalized name also
    appears among the studio links of the site's `/studios` page.
    """

    sitetag = "4k69com"
    base_url = "https://4k69.com"

    def __init__(self) -> None:
        super().__init__()
        # Normalized studio names; None until the first lookup. An empty set
        # records a failed fetch so it is not retried for every scene.
        self._studio_set: set[str] | None = None

    def _load_studio_set(self) -> set[str]:
        """Return the normalized names of all studios listed on `/studios`.

        The result is fetched once per instance and cached. An empty set means
        the fetch failed; scenes are then emitted without a studio rather than
        failing the crawl.
        """
        # Local imports: href values are URL-encoded and entity-encoded text.
        from html import unescape
        from urllib.parse import unquote

        if self._studio_set is not None:
            return self._studio_set
        try:
            r = browser_get(f"{self.base_url}/studios", timeout=self._timeout)
            r.raise_for_status()
            studios: set[str] = set()
            for raw in _STUDIO_LINK_RE.findall(r.text):
                # Keep only the studio path segment: drop any query string,
                # fragment or trailing sub-path captured from the href.
                segment = unescape(raw).split("?", 1)[0].split("#", 1)[0]
                segment = segment.strip("/").split("/", 1)[0]
                # Decode percent-escapes first, otherwise "Adult%20Time"
                # normalizes to "adult20time" and never matches the pill.
                key = _norm(unquote(segment))
                if key:
                    studios.add(key)
            self._studio_set = studios
            log.info("4k69: studio list loaded — %d studios", len(self._studio_set))
        except Exception as e:
            log.warning("4k69: studios page fetch failed: %s", e)
            self._studio_set = set()
        return self._studio_set

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Return the first category that is a known studio, else None.

        Args:
            category_names: Category pill names found on the detail page.

        Returns:
            The category name exactly as given (not the normalized form), or
            None when the studio list is unavailable or nothing matches.
        """
        from html import unescape

        studios = self._load_studio_set()
        if not studios:
            return None
        for name in category_names:
            # Entity-decode before normalizing so "A &amp; B" compares as "ab".
            if _norm(unescape(name)) in studios:
                return name
        return None
|
||||||
|
|
@ -1,294 +1,26 @@
|
||||||
"""hqfap.com — latest-vids browse scraper (PlayTube CMS).
|
"""hqfap.com — latest-vids browse scraper (PlayTube CMS, patrz _playtube.py).
|
||||||
|
|
||||||
Dołączony 2026-06-10 (user request). Re-uploader katalogu pornhd.pet (~120k scen,
|
Dołączony 2026-06-10 (user request). Re-uploader katalogu pornhd.pet (~120k scen,
|
||||||
thumbnaile to base64-encoded oryginalne URL-e w `/uploads/images/`).
|
thumbnaile to base64-encoded oryginalne URL-e w `/uploads/images/`).
|
||||||
|
|
||||||
Sygnały per scena (wszystko w SSR HTML detail page'a):
|
Specyfika vs baza: studio siedzi w kategoriach z suffixem " Clips"
|
||||||
- JSON-LD VideoObject: name, uploadDate (ISO), duration (ISO 8601 `PT26M48S`),
|
("Filthy Kings Clips" → studio "Filthy Kings"); reszta kategorii → tagi.
|
||||||
thumbnailUrl, contentUrl (direct mp4 — patrz extractor `hqfapcom`)
|
Playback: direct mp4 z JSON-LD contentUrl (cdnde.com nowsze / okcdn.ru starsze),
|
||||||
- Performerzy: blok "Pornstars:" — `<a class='pill' href='/videos/pornstar/<Name>'>`
|
tokeny time-bound i portable cross-IP → natywny extractor `hqfapcom`.
|
||||||
- Kategorie: blok "Categories & Tags:" — `<a class='pill' href='/videos/category/<Name>'>`
|
|
||||||
Część nazw ocenzurowana gwiazdkami (`Te***`) — pomijamy. Kategorie z suffixem
|
|
||||||
" Clips" to studia ("Filthy Kings Clips") → RawStudio.
|
|
||||||
|
|
||||||
Listing: strona główna i `/videos/latest` NIE paginują się przez GET (PlayTube
|
|
||||||
doładowuje AJAX-em), ale site ma pełny **sitemap index** (`/sitemap.xml` →
|
|
||||||
12× `sitemaps/videos/sitemap-N.xml`, po ~10k URL-i z `<lastmod>`). Crawl_page
|
|
||||||
buduje katalog z sitemap (sort lastmod desc = newest first) i tnie na strony po
|
|
||||||
20 URL-i — działa i dla browse_latest (pages 1-5) i dla deep_crawl (kursor).
|
|
||||||
|
|
||||||
Cloudflare: strony HTML wymagają browser TLS (curl_cffi w browser_get); plain
|
|
||||||
curl z VPS dostaje 403. Sitemap i thumbnaile schodzą bez challenge'a.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
from app.connectors.direct_scrapers._playtube import BasePlayTubeScraper
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from datetime import date, datetime
|
|
||||||
|
|
||||||
from app.connectors.base import (
|
|
||||||
RawFingerprint,
|
|
||||||
RawPerformer,
|
|
||||||
RawPlaybackSource,
|
|
||||||
RawScene,
|
|
||||||
RawStudio,
|
|
||||||
RawTag,
|
|
||||||
)
|
|
||||||
from app.connectors.direct_scrapers._browse_base import (
|
|
||||||
BaseBrowseScraper,
|
|
||||||
compute_thumbnail_phash,
|
|
||||||
)
|
|
||||||
from app.extractors import browser_get
|
|
||||||
from app.normalize.text import slugify
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_BASE = "https://hqfap.com"
|
|
||||||
_SITEMAP_INDEX = f"{_BASE}/sitemap.xml"
|
|
||||||
_PAGE_SIZE = 20
|
|
||||||
|
|
||||||
_SITEMAP_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>")
|
|
||||||
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.DOTALL | re.IGNORECASE)
|
|
||||||
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]+?)\s*</lastmod>")
|
|
||||||
_SCENE_ID_RE = re.compile(r"_(\d+)\.html")
|
|
||||||
|
|
||||||
_JSONLD_RE = re.compile(
|
|
||||||
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
|
|
||||||
re.IGNORECASE | re.DOTALL,
|
|
||||||
)
|
|
||||||
_ISO_DUR_RE = re.compile(r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE)
|
|
||||||
|
|
||||||
# Pille performera/kategorii. PlayTube renderuje single-quoted attrs; dopuszczamy
|
|
||||||
# oba quote'y. Nazwa z `<span itemprop='name'>` (href bywa URL-encoded / ze spacjami).
|
|
||||||
_PILL_RE = re.compile(
|
|
||||||
r"<a\s+class=['\"]pill['\"]\s+href=['\"]/videos/(?P<kind>pornstar|category)/[^'\"]*['\"]"
|
|
||||||
r".*?<span itemprop=['\"]name['\"]>(?P<name>[^<]+)</span>",
|
|
||||||
re.IGNORECASE | re.DOTALL,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_iso_duration(value: str | None) -> int | None:
|
class HQFapScraper(BasePlayTubeScraper):
|
||||||
"""`PT26M48S` → sekundy. None gdy format nieznany."""
|
|
||||||
if not value:
|
|
||||||
return None
|
|
||||||
m = _ISO_DUR_RE.match(value.strip())
|
|
||||||
if not m:
|
|
||||||
return None
|
|
||||||
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
|
||||||
return total or None
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_iso_date(value: str | None) -> date | None:
|
|
||||||
"""`2026-06-09T16:00:00+00:00` → date. None gdy parse fail."""
|
|
||||||
if not value:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
|
||||||
except ValueError:
|
|
||||||
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
|
||||||
if m:
|
|
||||||
try:
|
|
||||||
return date.fromisoformat(m.group(1))
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_video_object(html: str) -> dict | None:
|
|
||||||
"""Pierwszy JSON-LD VideoObject w HTML (hqfap emituje jeden, płaski dict)."""
|
|
||||||
for m in _JSONLD_RE.finditer(html):
|
|
||||||
raw = m.group(1).strip()
|
|
||||||
if not raw:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
data = json.loads(raw)
|
|
||||||
except (json.JSONDecodeError, ValueError):
|
|
||||||
continue
|
|
||||||
items = data if isinstance(data, list) else [data]
|
|
||||||
for obj in items:
|
|
||||||
if isinstance(obj, dict) and obj.get("@type") == "VideoObject":
|
|
||||||
return obj
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
class HQFapScraper(BaseBrowseScraper):
|
|
||||||
sitetag = "hqfapcom"
|
sitetag = "hqfapcom"
|
||||||
|
base_url = "https://hqfap.com"
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def _pick_studio(self, category_names: list[str]) -> str | None:
|
||||||
super().__init__()
|
for name in category_names:
|
||||||
# Katalog URL-i scen z sitemap, newest-first. Lazy-init raz per instancję
|
if name.lower().endswith(" clips"):
|
||||||
# (browse_latest i deep_crawl tworzą instancję per run, więc 13 fetchy XML
|
studio_name = name[: -len(" clips")].strip()
|
||||||
# amortyzuje się na cały run).
|
if studio_name:
|
||||||
self._catalog: list[str] | None = None
|
return studio_name
|
||||||
|
return None
|
||||||
# crawl_page override (jak EpornerApiScraper) — listing nie jest stronicowalny
|
|
||||||
# przez GET, źródłem paginacji jest sitemap. _listing_url/_extract_scene_urls
|
|
||||||
# nieużywane, ale abstrakcyjne — dostarczamy no-op implementacje.
|
|
||||||
def _listing_url(self, page: int) -> str: # pragma: no cover - nieużywane
|
|
||||||
return _SITEMAP_INDEX
|
|
||||||
|
|
||||||
def _extract_scene_urls(self, listing_html: str) -> list[str]: # pragma: no cover
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _load_catalog(self) -> list[str] | None:
|
|
||||||
"""Pełna lista URL-i scen posortowana lastmod desc. None = fetch fail."""
|
|
||||||
if self._catalog is not None:
|
|
||||||
return self._catalog
|
|
||||||
try:
|
|
||||||
idx = browser_get(_SITEMAP_INDEX, timeout=self._timeout)
|
|
||||||
idx.raise_for_status()
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("hqfap: sitemap index fetch failed: %s", e)
|
|
||||||
return None
|
|
||||||
sitemap_urls = [
|
|
||||||
u for u in _SITEMAP_LOC_RE.findall(idx.text) if "/videos/sitemap-" in u
|
|
||||||
]
|
|
||||||
if not sitemap_urls:
|
|
||||||
log.warning("hqfap: sitemap index has no video sitemaps")
|
|
||||||
return None
|
|
||||||
|
|
||||||
entries: list[tuple[str, str]] = [] # (lastmod, scene_url)
|
|
||||||
for sm_url in sitemap_urls:
|
|
||||||
try:
|
|
||||||
sm = browser_get(sm_url, timeout=self._timeout)
|
|
||||||
sm.raise_for_status()
|
|
||||||
except Exception as e:
|
|
||||||
# Brak jednego sitemapa ≠ fail całości — reszta katalogu wystarczy.
|
|
||||||
log.warning("hqfap: sitemap fetch failed %s: %s", sm_url, e)
|
|
||||||
continue
|
|
||||||
for block in _URL_BLOCK_RE.findall(sm.text):
|
|
||||||
loc_m = _SITEMAP_LOC_RE.search(block)
|
|
||||||
if not loc_m or "/watch/" not in loc_m.group(1):
|
|
||||||
continue
|
|
||||||
lastmod_m = _LASTMOD_RE.search(block)
|
|
||||||
entries.append((lastmod_m.group(1) if lastmod_m else "", loc_m.group(1)))
|
|
||||||
|
|
||||||
if not entries:
|
|
||||||
return None
|
|
||||||
# Dedup po scene id (sitemap potrafi powtórzyć URL między plikami).
|
|
||||||
entries.sort(key=lambda e: e[0], reverse=True)
|
|
||||||
seen_ids: set[str] = set()
|
|
||||||
catalog: list[str] = []
|
|
||||||
for _, url in entries:
|
|
||||||
id_m = _SCENE_ID_RE.search(url)
|
|
||||||
key = id_m.group(1) if id_m else url
|
|
||||||
if key in seen_ids:
|
|
||||||
continue
|
|
||||||
seen_ids.add(key)
|
|
||||||
catalog.append(url)
|
|
||||||
log.info("hqfap: catalog loaded — %d scenes from %d sitemaps",
|
|
||||||
len(catalog), len(sitemap_urls))
|
|
||||||
self._catalog = catalog
|
|
||||||
return catalog
|
|
||||||
|
|
||||||
def crawl_page(self, page: int) -> list[RawScene] | None:
|
|
||||||
catalog = self._load_catalog()
|
|
||||||
if catalog is None:
|
|
||||||
return None
|
|
||||||
start = (page - 1) * _PAGE_SIZE
|
|
||||||
chunk = catalog[start:start + _PAGE_SIZE]
|
|
||||||
if not chunk:
|
|
||||||
return []
|
|
||||||
out: list[RawScene] = []
|
|
||||||
for scene_url in chunk:
|
|
||||||
try:
|
|
||||||
res = browser_get(scene_url, timeout=self._timeout)
|
|
||||||
res.raise_for_status()
|
|
||||||
except Exception as e:
|
|
||||||
log.info("hqfap detail fetch failed %s: %s", scene_url, e)
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
raw = self._parse_detail(scene_url, res.text)
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("hqfap detail parse failed %s: %s", scene_url, e)
|
|
||||||
continue
|
|
||||||
if raw is not None:
|
|
||||||
out.append(raw)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
    """Turn one scene detail page into a ``RawScene``.

    Args:
        scene_url: URL of the detail page; the scene id is taken from it.
        detail_html: HTML of that page.

    Returns:
        The parsed scene, or None when the page has no JSON-LD VideoObject
        or the VideoObject carries no usable ``name``.
    """
    ld_video = _extract_video_object(detail_html)
    if not ld_video:
        log.info("hqfap: no JSON-LD VideoObject on %s", scene_url)
        return None

    scene_title = (ld_video.get("name") or "").strip()
    if not scene_title:
        return None

    id_match = _SCENE_ID_RE.search(scene_url)
    scene_id = id_match.group(1) if id_match else None

    length_sec = _parse_iso_duration(ld_video.get("duration"))
    uploaded_on = _parse_iso_date(ld_video.get("uploadDate"))
    poster_url = ld_video.get("thumbnailUrl") or None

    # Pills: pornstar -> performer; category -> tag, unless it carries the
    # " Clips" suffix (studio categories from the pornhd import, e.g.
    # "Filthy Kings Clips" -> "Filthy Kings").
    # Censored names (`Te***`) are skipped - asterisks are not data.
    studio: RawStudio | None = None
    performer_by_slug: dict[str, RawPerformer] = {}
    tag_by_slug: dict[str, RawTag] = {}
    for pill in _PILL_RE.finditer(detail_html):
        label = pill.group("name").strip()
        if not label or "*" in label:
            continue
        label_slug = slugify(label)
        if not label_slug:
            continue

        if pill.group("kind").lower() == "pornstar":
            if label_slug not in performer_by_slug:
                performer_by_slug[label_slug] = RawPerformer(
                    external_id=f"{self.sitetag}:performer:{label_slug}", name=label
                )
            continue

        if label.lower().endswith(" clips"):
            # Only the first studio pill wins; later ones are dropped entirely.
            if studio is None:
                studio_name = label[: -len(" clips")].strip()
                if studio_name:
                    studio_slug = slugify(studio_name)
                    studio = RawStudio(
                        external_id=f"{self.sitetag}:studio:{studio_slug}",
                        name=studio_name,
                        slug=studio_slug,
                    )
            continue

        if label_slug not in tag_by_slug:
            tag_by_slug[label_slug] = RawTag(
                external_id=f"{self.sitetag}:tag:{label_slug}", name=label, slug=label_slug
            )

    # Phash: thumbnails are re-encoded webp posters from pornhd.pet - for studio
    # content they are sometimes the original studio art (chance of a phash
    # match), for amateur content they will not match. Graceful: a miss falls
    # back to composite scoring (title+performer+duration).
    phash = (
        compute_thumbnail_phash(poster_url, referer=_BASE + "/") if poster_url else None
    )
    fingerprints: list[RawFingerprint] = (
        [RawFingerprint(kind="phash", value=phash)] if phash else []
    )

    # Stream: the JSON-LD contentUrl is a direct mp4, but its token (`time=`)
    # expires - stream_url is NOT stored; the `hqfapcom` extractor resolves a
    # fresh one on demand.
    source = RawPlaybackSource(
        origin=f"tube:{self.sitetag}",
        page_url=scene_url,
        duration_sec=length_sec,
        thumbnail_url=poster_url,
    )

    return RawScene(
        external_id=f"{self.sitetag}:{scene_id or scene_url}",
        title=scene_title,
        release_date=uploaded_on,
        duration_sec=length_sec,
        url=scene_url,
        studio=studio,
        performers=list(performer_by_slug.values()),
        tags=list(tag_by_slug.values()),
        fingerprints=fingerprints,
        playback_sources=[source],
    )
|
|
||||||
|
|
|
||||||
191
app/connectors/direct_scrapers/neporn.py
Normal file
191
app/connectors/direct_scrapers/neporn.py
Normal file
|
|
@ -0,0 +1,191 @@
|
||||||
|
"""neporn.com — latest-vids browse scraper (KVS engine).
|
||||||
|
|
||||||
|
Dołączony 2026-06-10 (user request). Paysite re-upload (HardX, DAP itp.),
|
||||||
|
~40k+ scen po numeracji video id.
|
||||||
|
|
||||||
|
Sygnały per scena:
|
||||||
|
- JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl
|
||||||
|
(uwaga: bywa zmanglowany "https:https://..." — normalizujemy)
|
||||||
|
- `<meta property="video:duration">` = sekundy wprost
|
||||||
|
- Performerzy: `<a class="link" href="https://neporn.com/models/<slug>/">`
|
||||||
|
(nazwa w spanie z ikoną — bierzemy slug i tytułujemy). Świeże uploady NIE mają
|
||||||
|
jeszcze linków modelek — fallback: flashvars `video_tags` (na świeżych scenach
|
||||||
|
to czyste nazwiska, np. "emily willis, gianna dior"); bierzemy wpisy 2-3 słowa
|
||||||
|
obecne w tytule i bez generycznych fraz (stoplist), bo starsze sceny mają tam
|
||||||
|
też szum ("deep throat", "natural tits", "hd porn").
|
||||||
|
- Kategorie: linki `/categories/<slug>/` w bloku info (tagi z `/tags/` to szum
|
||||||
|
typu "hd porn", "2020" — pomijamy)
|
||||||
|
|
||||||
|
Studio: brak strukturalnego pola — tytuł bywa "- HardX Update - ..." ale format
|
||||||
|
niespójny, zostawiamy fuzzy matchowi po tytule.
|
||||||
|
|
||||||
|
Listing: standard KVS `/latest-updates/N/` (24 sceny/strona).
|
||||||
|
Playback: KVS function/0 + license → `_kvs.resolve_kvs` server-side; finalny
|
||||||
|
`data001.neporn.com/remote_control.php?time=...` portable cross-IP
|
||||||
|
(test 2026-06-10: VPS resolve → lokalny ISP 206 video/mp4).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._playtube import (
|
||||||
|
_extract_video_object,
|
||||||
|
_parse_iso_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Site origin; used to build listing URLs and as the Referer for thumbnail fetches.
_BASE = "https://neporn.com"

# Absolute scene link; group 1 = full scene URL, group 2 = numeric video id.
_SCENE_URL_RE = re.compile(r'href="(https://neporn\.com/video/(\d+)/[^"]+)"')
# Link to a model page; group 1 = model slug.
_MODEL_LINK_RE = re.compile(r'href="https://neporn\.com/models/([a-z0-9\-]+)/"', re.IGNORECASE)
# Link to a category page; group 1 = category slug, group 2 = link text.
_CATEGORY_LINK_RE = re.compile(
    r'href="https://neporn\.com/categories/([a-z0-9\-]+)/"\s*>\s*([^<]+?)\s*<', re.IGNORECASE
)
# Single-quoted `video_tags: '...'` value; group 1 = the raw comma-separated list.
_VIDEO_TAGS_RE = re.compile(r"video_tags:\s*'([^']*)'")

# Words that disqualify a video_tags entry from being treated as a performer name
# (generic phrases such as "deep throat" / "natural tits" pass the "is in the
# title" test too often).
_PERF_STOPWORDS = frozenset(
    "porn sex tits ass anal throat cum blow blowjob dick cock pussy fuck fucking "
    "scene scenes hd milf teen big small double penetration facial creampie "
    "threesome amateur petite latina blonde brunette".split()
)
|
||||||
|
|
||||||
|
|
||||||
|
def _name_from_slug(slug: str) -> str:
|
||||||
|
"""`emily-willis` → `Emily Willis`."""
|
||||||
|
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
||||||
|
|
||||||
|
|
||||||
|
class NepornScraper(BaseBrowseScraper):
    """Browse scraper for the neporn.com `/latest-updates/` listing.

    Provides the listing URL, scene-link extraction and detail-page parsing
    hooks on top of `BaseBrowseScraper`.
    """

    # Prefix of every external_id and of the `tube:<sitetag>` playback origin.
    sitetag = "neporncom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*; page 1 (or lower) has no number suffix."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in *listing_html*, in first-seen order."""
        # dict.fromkeys drops repeated links while keeping document order.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from one scene detail page.

        Args:
            scene_url: URL of the detail page; the numeric video id is read
                from its `/video/<id>/` segment.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when neither the JSON-LD `name` nor the
            `og:title` meta yields a non-empty title.
        """
        video = _extract_video_object(detail_html) or {}

        title = (video.get("name") or meta_content(detail_html, property="og:title") or "").strip()
        # Titles sometimes start with a dangling separator ("- HardX Update - ...").
        title = title.lstrip("- ").strip()
        if not title:
            return None

        id_m = re.search(r"/video/(\d+)/", scene_url)
        video_id = id_m.group(1) if id_m else None

        # `video:duration` holds plain seconds. isdecimal() rather than isdigit():
        # isdigit() also accepts characters such as "²" that int() rejects, which
        # would raise ValueError instead of leaving the duration unset.
        duration_sec: int | None = None
        dur_meta = (meta_content(detail_html, property="video:duration") or "").strip()
        if dur_meta.isdecimal():
            duration_sec = int(dur_meta)

        release_date = _parse_iso_date(video.get("uploadDate"))
        description = (video.get("description") or "").strip() or None
        thumbnail_url = (video.get("thumbnailUrl") or "").strip() or None
        if thumbnail_url:
            # JSON-LD emits "https:https://cdn..." - cut the duplicated scheme.
            thumbnail_url = re.sub(r"^https?:(?=https?://)", "", thumbnail_url)

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()

        def add_performer(slug: str) -> None:
            """Append a performer for *slug* unless that slug was already added."""
            if slug in seen_perf:
                return
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:model:{slug}",
                    name=_name_from_slug(slug),
                )
            )

        for m in _MODEL_LINK_RE.finditer(detail_html):
            add_performer(m.group(1))

        # Fallback for fresh uploads without model links: names from the flashvars
        # video_tags - 2-3 words, present in the title, no generic phrases.
        # NOTE(review): this also runs when model links were found; the slug
        # dedup only prevents exact repeats - confirm that merging is intended.
        title_cf = title.casefold()
        tags_m = _VIDEO_TAGS_RE.search(detail_html)
        for entry in (tags_m.group(1).split(",") if tags_m else []):
            entry = entry.strip()
            words = entry.split()
            if not (2 <= len(words) <= 3):
                continue
            if any(w.casefold() in _PERF_STOPWORDS for w in words):
                continue
            if entry.casefold() not in title_cf:
                continue
            # Build the slug from the split words so repeated or odd whitespace
            # inside an entry cannot yield "first--last" and bypass the dedup.
            add_performer("-".join(w.casefold() for w in words))

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _CATEGORY_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug))

        # Phash from KVS screenshots - a low hit rate is expected (like porn00);
        # a miss falls back to composite scoring (performer+title+duration).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{video_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
|
|
@ -29,10 +29,12 @@ from app.extractors.tubes import (
|
||||||
_ytdlp,
|
_ytdlp,
|
||||||
eporner,
|
eporner,
|
||||||
freshporno,
|
freshporno,
|
||||||
|
fourk69,
|
||||||
fullmovies,
|
fullmovies,
|
||||||
hdporngg,
|
hdporngg,
|
||||||
hqfap,
|
hqfap,
|
||||||
hqporner,
|
hqporner,
|
||||||
|
neporn,
|
||||||
latestpornvideo,
|
latestpornvideo,
|
||||||
paradisehill,
|
paradisehill,
|
||||||
porn00,
|
porn00,
|
||||||
|
|
@ -179,6 +181,13 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# Cross-IP test 2026-06-10: oba CDN-y portable (`ip=`/`srcIp=` nie egzekwowane),
|
# Cross-IP test 2026-06-10: oba CDN-y portable (`ip=`/`srcIp=` nie egzekwowane),
|
||||||
# tokeny time-bound → on-demand fetch daje świeży URL. Mobile direct, zero proxy.
|
# tokeny time-bound → on-demand fetch daje świeży URL. Mobile direct, zero proxy.
|
||||||
"hqfapcom": hqfap.extract,
|
"hqfapcom": hqfap.extract,
|
||||||
|
# 4k69 — get_file (www.4kporno.xxx, rodzina fullmovies/hdporngg): binduje CDN do IP
|
||||||
|
# fetchera → oddajemy niezresolwowane (mobile_direct), telefon follow-uje 302.
|
||||||
|
# Skip 2160p (CDN time-out). Cross-IP test 2026-06-10: 206 z lokalnego ISP.
|
||||||
|
"4k69com": fourk69.extract,
|
||||||
|
# neporn — KVS function/0 + license (jak freshporno). Server-side _kvs resolve →
|
||||||
|
# data001.neporn.com/remote_control.php portable (cross-IP 206, 2026-06-10).
|
||||||
|
"neporncom": neporn.extract,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
56
app/extractors/tubes/fourk69.py
Normal file
56
app/extractors/tubes/fourk69.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""4k69.com — get_file stream extractor (platforma jak fullmovies/hdporngg).
|
||||||
|
|
||||||
|
Scene page (SSR za Cloudflare → curl_cffi) ma 3 get_file URL-e na www.4kporno.xxx
|
||||||
|
(`..._2160m.mp4` / `_720m` / `_480m`) — w JSON-LD contentUrl i w JS playera, NIE
|
||||||
|
w `<source>` tagach (dlatego nie _source_getfile, tylko skan całej strony).
|
||||||
|
|
||||||
|
Jak fpvcdn (fullmovies, ta sama rodzina `/get_file/8512/`): get_file binduje CDN
|
||||||
|
do IP fetchera, jest stateless i ważny ≥90s → oddajemy NIEZRESOLWOWANE z
|
||||||
|
mobile_direct_ok — telefon follow-uje 302 z własnym IP (cross-IP test 2026-06-10:
|
||||||
|
lokalny ISP 206 video/mp4). 2160p pomijamy (CDN time-out ~30s, jak fpvcdn).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.extractors._fetch import fetch_tube_html
|
||||||
|
from app.extractors._models import StreamSource
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Any https `.../get_file/....mp4` URL (optional trailing slash); the match stops
# at whitespace, a quote or a backslash.
_GET_FILE_RE = re.compile(r"https://[a-z0-9.\-]+/get_file/[^\s\"'\\]+\.mp4/?", re.IGNORECASE)
# Quality number in the file name, e.g. `_720m.mp4` -> group 1 = "720".
_QUALITY_RE = re.compile(r"_(\d{3,4})[mp]?\.mp4", re.IGNORECASE)
# Quality numbers that extract() drops from its result.
_SKIP_QUALITY_RE = re.compile(r"^(2160|1440)$")
|
||||||
|
|
||||||
|
|
||||||
|
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Collect the unresolved get_file mp4 links from a 4k69.com scene page.

    Args:
        page_url: URL of the scene page to fetch.
        timeout: Passed through to ``fetch_tube_html``.

    Returns:
        Stream sources ordered from highest to lowest quality, each flagged
        ``mobile_direct_ok``, or None when the page yields no usable link.
    """
    page = fetch_tube_html(page_url, timeout=timeout)

    # dict.fromkeys de-duplicates the links while keeping document order.
    candidates = dict.fromkeys(hit.group(0) for hit in _GET_FILE_RE.finditer(page))

    streams: list[StreamSource] = []
    for link in candidates:
        quality_hit = _QUALITY_RE.search(link)
        if quality_hit is None:
            # `_preview.mp4` etc. carry no quality number - skip (trailer, not the scene).
            continue
        height = quality_hit.group(1)
        if _SKIP_QUALITY_RE.match(height):
            continue
        streams.append(
            StreamSource(
                link=link,
                quality=f"{height}p",
                type="mp4",
                referer="https://4k69.com/",
                raw={"mobile_direct_ok": True},
            )
        )

    if not streams:
        log.info("4k69: no get_file URLs on %s", page_url)
        return None
    return sorted(streams, key=lambda src: int((src.quality or "0p")[:-1]), reverse=True)
|
||||||
19
app/extractors/tubes/neporn.py
Normal file
19
app/extractors/tubes/neporn.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""neporn.com — KVS (kt_player) direct stream extractor. Patrz app/extractors/tubes/_kvs.py.
|
||||||
|
|
||||||
|
Flashvars `video_url` = `function/0/...get_file/...` + `license_code` (silnik jak
|
||||||
|
freshporno/porn00/yespornvip). Resolve server-side: decode + follow 302 →
|
||||||
|
`data001.neporn.com/remote_control.php?time=&cv=...`.
|
||||||
|
|
||||||
|
Cross-IP test 2026-06-10: finalny URL portable (token time-bound, NIE IP-bound —
|
||||||
|
VPS resolve → lokalny ISP 206 video/mp4). Mobile gra direct, zero proxy/WebView.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.extractors._models import StreamSource
|
||||||
|
from app.extractors.tubes import _kvs
|
||||||
|
|
||||||
|
# Site origin, handed to the shared KVS resolver as `base_url`.
_BASE = "https://neporn.com"
|
||||||
|
|
||||||
|
|
||||||
|
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Resolve the streams of a neporn.com scene page via the shared KVS resolver.

    Delegates to ``_kvs.resolve_kvs`` with this site's base URL and returns
    its result unchanged.
    """
    resolved = _kvs.resolve_kvs(page_url, base_url=_BASE, timeout=timeout)
    return resolved
|
||||||
Loading…
Add table
Reference in a new issue