goon/app/connectors/direct_scrapers/neporn.py
jtrzupek 80fd83cb4e feat(tubes): add 4k69 + neporn browse scrapers, shared PlayTube base
4k69.com (~65k scenes): same PlayTube CMS as hqfap - common logic moved
to _playtube.py (sitemap catalog, JSON-LD, pills). Studio classified by
matching category pills against the studios index page. Streams are
get_file (fullmovies family) returned unresolved with mobile_direct,
2160p skipped.

neporn.com: KVS engine, latest-updates listing, JSON-LD + video:duration
meta, performers from models links with flashvars video_tags fallback
for fresh uploads. Resolve via _kvs; final URL portable cross-IP.

superporn.com rejected: Cloudflare 403 from VPS on all TLS impersonations.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 18:15:13 +02:00

191 lines
7.2 KiB
Python

"""neporn.com — latest-vids browse scraper (KVS engine).
Added 2026-06-10 (user request). Paysite re-uploads (HardX, DAP, etc.),
~40k+ scenes judging by the video id numbering.
Per-scene signals:
- JSON-LD VideoObject: name, description, uploadDate, thumbnailUrl
(note: the URL is sometimes mangled as "https:https://..." — we normalize it)
- `<meta property="video:duration">` = duration given directly in seconds
- Performers: `<a class="link" href="https://neporn.com/models/<slug>/">`
(the name sits in a span with an icon — we take the slug and title-case it). Fresh
uploads do NOT have model links yet — fallback: flashvars `video_tags` (on fresh
scenes these are clean names, e.g. "emily willis, gianna dior"); we keep entries of
2-3 words that appear in the title and contain no generic words (stoplist), because
older scenes also carry noise there ("deep throat", "natural tits", "hd porn").
- Categories: `/categories/<slug>/` links in the info block (tags from `/tags/` are
noise such as "hd porn", "2020" — we skip them)
Studio: no structured field — the title is sometimes "- HardX Update - ..." but the
format is inconsistent, so we leave it to the fuzzy title matcher.
Listing: standard KVS `/latest-updates/N/` (24 scenes per page).
Playback: KVS function/0 + license → `_kvs.resolve_kvs` server-side; the final
`data001.neporn.com/remote_control.php?time=...` URL is portable across IPs
(test 2026-06-10: resolved on the VPS → local ISP received 206 video/mp4).
"""
from __future__ import annotations
import logging
import re
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
from app.connectors.direct_scrapers._playtube import (
_extract_video_object,
_parse_iso_date,
)
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Site root; the listing URLs and the thumbnail-fetch referer are built from it.
_BASE = "https://neporn.com"
# Absolute scene links: group 1 = full scene URL, group 2 = numeric video id.
_SCENE_URL_RE = re.compile(r'href="(https://neporn\.com/video/(\d+)/[^"]+)"')
# Absolute performer-profile links: group 1 = model slug. Matching is
# case-insensitive, so the captured slug is not guaranteed to be lower-case.
_MODEL_LINK_RE = re.compile(r'href="https://neporn\.com/models/([a-z0-9\-]+)/"', re.IGNORECASE)
# Absolute category links whose href is the last attribute before `>`:
# group 1 = category slug, group 2 = link text with surrounding whitespace trimmed.
_CATEGORY_LINK_RE = re.compile(
    r'href="https://neporn\.com/categories/([a-z0-9\-]+)/"\s*>\s*([^<]+?)\s*<', re.IGNORECASE
)
# Single-quoted value of the `video_tags:` entry (group 1 = raw comma-separated list).
_VIDEO_TAGS_RE = re.compile(r"video_tags:\s*'([^']*)'")
# Words that disqualify a video_tags entry from being treated as a performer name
# (generic phrases such as "deep throat" / "natural tits" pass the
# "appears in the title" test too often).
_PERF_STOPWORDS = frozenset(
    "porn sex tits ass anal throat cum blow blowjob dick cock pussy fuck fucking "
    "scene scenes hd milf teen big small double penetration facial creampie "
    "threesome amateur petite latina blonde brunette".split()
)
def _name_from_slug(slug: str) -> str:
"""`emily-willis` → `Emily Willis`."""
return " ".join(w.capitalize() for w in slug.split("-") if w)
class NepornScraper(BaseBrowseScraper):
    """Browse scraper for neporn.com, driven by the site's latest-updates listing.

    Implements `_listing_url`, `_extract_scene_urls` and `_parse_detail`; how
    these are invoked is defined by `BaseBrowseScraper` (not visible here).
    """

    # Site identifier: prefixes every external_id and the playback origin below.
    sitetag = "neporncom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*; page 1 (or lower) has no numeric suffix."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs linked from *listing_html*, unique, in first-seen order."""
        # dict.fromkeys keeps insertion order, giving an order-preserving de-dup.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    @staticmethod
    def _ld_text(value: object) -> str:
        """Coerce a JSON-LD property value to a stripped string.

        JSON-LD allows a property to be array-valued; in that case the first
        non-blank string element is used. Anything that is not a string (or a
        list/tuple containing one) yields "" instead of raising.
        """
        if isinstance(value, (list, tuple)):
            value = next((v for v in value if isinstance(v, str) and v.strip()), "")
        return value.strip() if isinstance(value, str) else ""

    @staticmethod
    def _parse_duration(raw: str | None) -> int | None:
        """Parse a whole-seconds duration string; return None if absent or not numeric."""
        text = (raw or "").strip()
        # isdecimal(), unlike isdigit(), rejects characters such as "²" that
        # int() cannot parse, so the conversion below cannot raise on them.
        return int(text) if text.isdecimal() else None

    def _collect_performers(self, detail_html: str, title: str) -> list[RawPerformer]:
        """Collect performers from model links, then from qualifying `video_tags` entries.

        Slugs are lower-cased before de-duplication because `_MODEL_LINK_RE`
        matches case-insensitively while tag-derived slugs are casefolded; this
        keeps both sources (and the resulting external ids) consistent.
        """
        by_slug: dict[str, RawPerformer] = {}

        def add(slug: str) -> None:
            if slug not in by_slug:
                by_slug[slug] = RawPerformer(
                    external_id=f"{self.sitetag}:model:{slug}",
                    name=_name_from_slug(slug),
                )

        for m in _MODEL_LINK_RE.finditer(detail_html):
            add(m.group(1).lower())

        # Fallback for fresh uploads that have no model links yet: names from the
        # flashvars video_tags — 2-3 words, present in the title, no generic words.
        # NOTE(review): this pass runs even when model links were found; entries
        # are merged by slug rather than used only as a fallback.
        title_cf = title.casefold()
        tags_m = _VIDEO_TAGS_RE.search(detail_html)
        for entry in (tags_m.group(1).split(",") if tags_m else []):
            words = entry.casefold().split()
            if not 2 <= len(words) <= 3:
                continue
            if any(w in _PERF_STOPWORDS for w in words):
                continue
            if " ".join(words) not in title_cf:
                continue
            # Build the slug from the split words so irregular whitespace in the
            # tag cannot leak into it (e.g. as a doubled hyphen).
            add("-".join(words))
        return list(by_slug.values())

    def _collect_tags(self, detail_html: str) -> list[RawTag]:
        """Collect category links as tags, one per (lower-cased) category slug."""
        by_slug: dict[str, RawTag] = {}
        for m in _CATEGORY_LINK_RE.finditer(detail_html):
            slug, name = m.group(1).lower(), m.group(2).strip()
            # The pattern can capture a whitespace-only link text; skip those.
            if not name or slug in by_slug:
                continue
            # NOTE(review): name is raw HTML text; entities are not unescaped here.
            by_slug[slug] = RawTag(
                external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug
            )
        return list(by_slug.values())

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a scene detail page.

        Args:
            scene_url: URL of the scene page; its `/video/<id>/` segment supplies
                the external id (the whole URL is used when no id is found).
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when no title can be determined.
        """
        video = _extract_video_object(detail_html) or {}

        # Prefer the JSON-LD name; fall back to og:title when it is missing or blank.
        title = self._ld_text(video.get("name")) or (
            meta_content(detail_html, property="og:title") or ""
        ).strip()
        # Titles sometimes start with a dangling separator ("- HardX Update - ...").
        title = title.lstrip("- ").strip()
        if not title:
            return None

        id_m = re.search(r"/video/(\d+)/", scene_url)
        video_id = id_m.group(1) if id_m else None

        duration_sec = self._parse_duration(
            meta_content(detail_html, property="video:duration")
        )
        release_date = _parse_iso_date(video.get("uploadDate"))
        description = self._ld_text(video.get("description")) or None

        thumbnail_url = self._ld_text(video.get("thumbnailUrl")) or None
        if thumbnail_url:
            # JSON-LD emits "https:https://cdn..." — cut off the duplicated scheme.
            thumbnail_url = re.sub(r"^https?:(?=https?://)", "", thumbnail_url)

        performers = self._collect_performers(detail_html, title)
        tags = self._collect_tags(detail_html)

        # Phash from KVS screenshots — low hit-rate expected (like porn00);
        # a miss degrades gracefully to composite scoring (performer+title+duration).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{video_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )