4k69.com (~65k scenes): same PlayTube CMS as hqfap - common logic moved to _playtube.py (sitemap catalog, JSON-LD, pills). Studio classified by matching category pills against the studios index page. Streams are get_file (fullmovies family) returned unresolved with mobile_direct, 2160p skipped. neporn.com: KVS engine, latest-updates listing, JSON-LD + video:duration meta, performers from models links with flashvars video_tags fallback for fresh uploads. Resolve via _kvs; final URL portable cross-IP. superporn.com rejected: Cloudflare 403 from VPS on all TLS impersonations. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
191 lines
7.2 KiB
Python
191 lines
7.2 KiB
Python
"""neporn.com — latest-videos browse scraper (KVS engine).

Added 2026-06-10 (user request). Paysite re-uploads (HardX, DAP, etc.),
~40k+ scenes judging by the video id numbering.

Signals per scene:
- JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl
  (note: the URL is sometimes mangled as "https:https://..." — we normalize it)
- `<meta property="video:duration">` = duration in seconds, as-is
- Performers: `<a class="link" href="https://neporn.com/models/<slug>/">`
  (the name sits in a span with an icon — we take the slug and title-case it).
  Fresh uploads do NOT have model links yet — fallback: flashvars `video_tags`
  (on fresh scenes these are clean names, e.g. "emily willis, gianna dior");
  we take entries of 2-3 words that appear in the title and contain no generic
  terms (stoplist), because older scenes also carry noise there
  ("deep throat", "natural tits", "hd porn").
- Categories: `/categories/<slug>/` links in the info block (tags from `/tags/`
  are noise such as "hd porn", "2020" — we skip them)

Studio: no structured field — the title is sometimes "- HardX Update - ..." but
the format is inconsistent, so we leave it to the fuzzy title matcher.

Listing: standard KVS `/latest-updates/N/` (24 scenes per page).
Playback: KVS function/0 + license → `_kvs.resolve_kvs` server-side; the final
`data001.neporn.com/remote_control.php?time=...` URL is portable across IPs
(test 2026-06-10: resolved on the VPS → local ISP received 206 video/mp4).
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
meta_content,
|
|
)
|
|
from app.connectors.direct_scrapers._playtube import (
|
|
_extract_video_object,
|
|
_parse_iso_date,
|
|
)
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_BASE = "https://neporn.com"
|
|
|
|
_SCENE_URL_RE = re.compile(r'href="(https://neporn\.com/video/(\d+)/[^"]+)"')
|
|
_MODEL_LINK_RE = re.compile(r'href="https://neporn\.com/models/([a-z0-9\-]+)/"', re.IGNORECASE)
|
|
_CATEGORY_LINK_RE = re.compile(
|
|
r'href="https://neporn\.com/categories/([a-z0-9\-]+)/"\s*>\s*([^<]+?)\s*<', re.IGNORECASE
|
|
)
|
|
_VIDEO_TAGS_RE = re.compile(r"video_tags:\s*'([^']*)'")
|
|
|
|
# Słowa dyskwalifikujące wpis z video_tags jako nazwisko (generyczne frazy typu
|
|
# "deep throat" / "natural tits" przechodzą test "jest w tytule" zbyt często).
|
|
_PERF_STOPWORDS = frozenset(
|
|
"porn sex tits ass anal throat cum blow blowjob dick cock pussy fuck fucking "
|
|
"scene scenes hd milf teen big small double penetration facial creampie "
|
|
"threesome amateur petite latina blonde brunette".split()
|
|
)
|
|
|
|
|
|
def _name_from_slug(slug: str) -> str:
|
|
"""`emily-willis` → `Emily Willis`."""
|
|
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
|
|
|
|
|
class NepornScraper(BaseBrowseScraper):
    """Browse scraper for the neporn.com ``/latest-updates/`` listing.

    Builds listing URLs, extracts scene links from listing HTML and parses a
    scene detail page into a ``RawScene`` (title, description, release date,
    duration, performers, category tags, thumbnail phash and one playback
    source that points back at the scene page).
    """

    sitetag = "neporncom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``.

        Page 1 (and anything lower) maps to the bare ``/latest-updates/``
        path; later pages use ``/latest-updates/<page>/``.
        """
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in ``listing_html``, in first-seen order."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    def _performers_from_html(self, detail_html: str, title: str) -> list[RawPerformer]:
        """Collect performers from model links plus filtered ``video_tags`` entries.

        Slugs are casefolded before de-duplication: the link regex is
        case-insensitive while the ``video_tags`` path is casefolded, so
        without normalization the same performer could be emitted twice with
        different external ids.
        """
        # Insertion-ordered set of slugs (dict keys); values are unused.
        slugs: dict[str, None] = {}

        for m in _MODEL_LINK_RE.finditer(detail_html):
            slugs.setdefault(m.group(1).casefold(), None)

        # Names from flashvars video_tags. Intended for fresh uploads that have
        # no model links yet, but it runs unconditionally; names already seen
        # via links are skipped by the slug de-duplication. An entry qualifies
        # when it is 2-3 words, contains no stoplist word and appears in the
        # title.
        title_cf = title.casefold()
        tags_m = _VIDEO_TAGS_RE.search(detail_html)
        for entry in (tags_m.group(1).split(",") if tags_m else ()):
            # Splitting on whitespace also collapses repeated spaces, so the
            # slug never contains empty segments ("a--b").
            words = entry.casefold().split()
            if not 2 <= len(words) <= 3:
                continue
            if not _PERF_STOPWORDS.isdisjoint(words):
                continue
            if " ".join(words) not in title_cf:
                continue
            slugs.setdefault("-".join(words), None)

        return [
            RawPerformer(
                external_id=f"{self.sitetag}:model:{slug}",
                name=_name_from_slug(slug),
            )
            for slug in slugs
        ]

    def _category_tags_from_html(self, detail_html: str) -> list[RawTag]:
        """Collect one ``RawTag`` per distinct ``/categories/<slug>/`` link."""
        tags: list[RawTag] = []
        seen: set[str] = set()
        for m in _CATEGORY_LINK_RE.finditer(detail_html):
            # Casefold for the same reason as performer slugs: the regex is
            # case-insensitive, the de-dup key and external id should not be.
            slug, name = m.group(1).casefold(), m.group(2).strip()
            if not name or slug in seen:
                continue
            seen.add(slug)
            tags.append(
                RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            )
        return tags

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a ``RawScene``.

        Args:
            scene_url: URL of the scene page; the numeric id is taken from its
                ``/video/<id>/`` segment when present.
            detail_html: HTML of the scene page.

        Returns:
            The parsed ``RawScene``, or ``None`` when no title can be found in
            either the JSON-LD ``name`` or the ``og:title`` meta tag.
        """
        video = _extract_video_object(detail_html) or {}

        title = (
            video.get("name") or meta_content(detail_html, property="og:title") or ""
        ).strip()
        # Titles sometimes start with a dangling separator ("- HardX Update - ...").
        title = title.lstrip("- ").strip()
        if not title:
            return None

        id_m = re.search(r"/video/(\d+)/", scene_url)
        video_id = id_m.group(1) if id_m else None

        duration_sec: int | None = None
        dur_meta = (meta_content(detail_html, property="video:duration") or "").strip()
        # isdecimal() rather than isdigit(): isdigit() accepts characters such
        # as superscript digits that int() rejects with ValueError.
        if dur_meta.isdecimal():
            duration_sec = int(dur_meta)

        release_date = _parse_iso_date(video.get("uploadDate"))
        description = (video.get("description") or "").strip() or None
        thumbnail_url = (video.get("thumbnailUrl") or "").strip() or None
        if thumbnail_url:
            # JSON-LD emits "https:https://cdn..." — drop the duplicated scheme.
            thumbnail_url = re.sub(r"^https?:(?=https?://)", "", thumbnail_url)

        performers = self._performers_from_html(detail_html, title)
        tags = self._category_tags_from_html(detail_html)

        # Phash from the KVS screenshot — a low hit-rate is expected (as with
        # porn00); a miss degrades gracefully to composite scoring
        # (performer + title + duration).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            # Fall back to the full URL when it carries no numeric video id.
            external_id=f"{self.sitetag}:{video_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|