User-facing bugs resolved (per bug_reports table 2026-05-25): - 40cd28aa (short-scene filter): mobile api.ts default min_duration_sec=60 hides 6519 sub-60s scenes across all list endpoints (Performer/Site/Tag/ Browse). Caller may override with explicit 0. - 5e89ef7e (porndoe needs cookies/play click): INJECTED_JS in PlayerScreen now auto-clicks player-poster overlay (player-poster-play, big-play-button, vjs-big-play-button, jw-icon-display, btn-big-play, mejs__overlay-button, play-button, btn-play, videoPlayButton). Triggered same interval as consent-dismiss + ad-iframe removal. - b1b5e1a2 (Mixdrop czarny ekran): re-enable mixdrop direct stream via VPS curl_cffi proxy (was: skip → WebView fallback → blank screen). Backend pipeline (mixdrop.py extract + stream_proxy._curl_cffi_stream with JA3 + auto-refetch on token expire) was already complete; just removed the skip in app/api/playback.py. Plus ongoing WIP (paradisehill multi-part extraction, stream_proxy refetch logic, gesture race fix for long-press 2x speed, anti-adblock INJECTED_JS defenses, scripts for freshporno backfill, new sources API).
382 lines
14 KiB
Python
382 lines
14 KiB
Python
"""Paradisehill connector — primary source dla movies (full-length adult films).
|
||
|
||
Site notes:
|
||
- Age-gate: wymagany cookie `is18=1` (POST /is18/ zwraca 400 z curla, ale samo dorzucenie
|
||
cookie do GET-a działa — site jest tolerancyjny).
|
||
- Listing: `/all/?sort=created_at&page=N` — paginacja po 28 filmów, mikro-data Schema.org Movie.
|
||
- Detail: `/<hex_id>/` — pełne meta + Video.js playlist (chaptery jako "Part 1/2/3").
|
||
|
||
Co ekstraktujemy:
|
||
- Schema.org microdata: name, description, director, datePublished (upload), image, thumbnailUrl
|
||
- Studio: link `/studio/<id>/{name}` (tylko link dostarcza nazwę i external_id)
|
||
- Genres: ze Schema.org `itemprop="genre"` (pierwszy = movie's main genre)
|
||
- Year: parsowany z description gdy obecny ("This 1999 film..."), bo `datePublished` to upload_date
|
||
- Chapters: liczba `<li>...Part N</li>` w playliście Video.js
|
||
- Playback: na MVP `page_url` only — Video.js playlist URL jest dynamicznie ładowany przez JS
|
||
i wymaga login session. Mobile może otworzyć page w WebView (degradacja lepsza niż brak).
|
||
|
||
External_id: hex slug z URL-a (np. `259448f6b75ee` z `/259448f6b75ee/`).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import re
|
||
from collections.abc import Iterator
|
||
from datetime import UTC, date, datetime
|
||
from typing import Any
|
||
|
||
import httpx
|
||
|
||
from app.connectors.base import (
|
||
BaseMovieConnector,
|
||
RawMovie,
|
||
RawMovieChapter,
|
||
RawPerformer,
|
||
RawPlaybackSource,
|
||
RawStudio,
|
||
RawTag,
|
||
)
|
||
from app.models.source import SourceKind
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
# Site root. Used as the HTTP client's base URL and prefixed onto the relative
# image paths and detail-page URLs built by the parser.
BASE_URL = "https://paradisehill.cc"
# Desktop Chrome User-Agent sent with every request made by this module.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# Listing endpoint; the query string is appended by the caller.
LISTING_PATH = "/all/"  # ?sort=created_at&page=N
# Connector name; also used as the `origin` of the playback sources it emits.
SOURCE_NAME = "paradisehill"
|
||
|
||
|
||
# Microdata extraction — the Schema.org tags are stable and survive light theme
# changes (the yii2 widget renders them invariantly).

# Movie title: text of the <h1 class="title-inside" itemprop="name"> heading.
_TITLE_RE = re.compile(
    r'<h1\s+class="title-inside"\s+itemprop="name">([^<]+)</h1>', re.IGNORECASE
)
# Director: text directly following the itemprop="director" attribute.
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)</', re.IGNORECASE)
# Description: text up to the closing </span>, allowing inline tags inside
# (the caller strips those tags afterwards).
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">([^<]+(?:<[^>]+>[^<]+)*)</span>', re.IGNORECASE | re.DOTALL
)
# Upload date: the `content` attribute of the datePublished item.
_DATE_PUBLISHED_RE = re.compile(
    r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# Poster: site-relative `/images/...` path of the itemprop="image" <img>.
_POSTER_RE = re.compile(
    r'<img\s+itemprop="image"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Thumbnail: site-relative `/images/...` path of the itemprop="thumbnailUrl" <img>.
_THUMBNAIL_RE = re.compile(
    r'<img\s+itemprop="thumbnailUrl"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Studio link: group 1 = numeric studio id, group 2 = studio name (link text).
_STUDIO_LINK_RE = re.compile(r'<a\s+href="/studio/(\d+)/"[^>]*>([^<]+)</a>', re.IGNORECASE)
# Playlist entry: group 1 = data-index, group 2 = entry label (link text).
_CHAPTER_RE = re.compile(
    r'<a\s+href="#"\s+class="js-list-item"\s+data-index="(\d+)">([^<]+)</a>',
    re.IGNORECASE,
)
|
||
# `videoList` JS array on the detail page — may hold multiple parts (Video.js playlist):
#   var videoList = [{"sources":[{"src":"...part1.mp4","type":"video/mp4"}]}, ...]
# Without parsing it the mobile WebView plays only the first part and skips the rest.
# Bug reports `c5693926`/`418270e4` 2026-05-21 ("loads only 1 of 4 parts").
_VIDEO_LIST_RE = re.compile(r"var\s+videoList\s*=\s*(\[.*?\])\s*;", re.IGNORECASE | re.DOTALL)
_VIDEO_SRC_RE = re.compile(r'"src"\s*:\s*"([^"]+\.mp4[^"]*)"', re.IGNORECASE)


def extract_video_parts(html: str) -> list[tuple[str, str]]:
    """Extract the list of MP4 parts from a paradisehill detail page.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        `[(mp4_url, label), ...]`, e.g. `[(".../part1.mp4", "Part 1"), ...]`,
        in the order the sources appear in `videoList`. Empty when `videoList`
        is absent or contains no `.mp4` sources.
    """
    # Function-scope import keeps this block self-contained; `json` is stdlib.
    import json

    list_match = _VIDEO_LIST_RE.search(html)
    if not list_match:
        return []

    parts: list[tuple[str, str]] = []
    for i, src_m in enumerate(_VIDEO_SRC_RE.finditer(list_match.group(1)), start=1):
        raw = src_m.group(1)
        try:
            # The captured text is the inside of a JSON string literal, so decode
            # it as one: this undoes `\/` as well as `\uXXXX` (e.g. `\u0026` for
            # `&` in query strings) instead of only the `\/` escape.
            url = json.loads(f'"{raw}"')
        except ValueError:
            # Not a valid JSON string body — keep the previous best-effort behaviour.
            url = raw.replace("\\/", "/")
        parts.append((url, f"Part {i}"))
    return parts
|
||
|
||
|
||
def fetch_and_extract_parts(page_url: str, *, timeout: float = 20.0) -> list[tuple[str, str]]:
    """Resolve-time helper: download a detail page and return its `videoList` parts.

    Args:
        page_url: Absolute URL of the detail page to download.
        timeout: Timeout handed to the one-shot `httpx.Client`.

    Returns:
        Whatever `extract_video_parts` returns for the downloaded HTML.

    Raises:
        httpx.HTTPStatusError: Raised by `raise_for_status()` on an error response.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        # Age-gate cookie, sent up-front with the request.
        "Cookie": "is18=1",
        "Accept-Language": "en-US,en;q=0.9",
    }
    with httpx.Client(
        timeout=timeout,
        follow_redirects=True,
        headers=request_headers,
    ) as session:
        response = session.get(page_url)
        response.raise_for_status()
        page_html = response.text
    return extract_video_parts(page_html)
|
||
# Listing page item: captures the hex id from the link that opens each
# `item list-film-item` tile.
_LIST_ITEM_RE = re.compile(
    r'<div\s+class="item\s+list-film-item"[^>]*>\s*'
    r'<a\s+href="/([0-9a-f]+)/"[^>]*>',
    re.IGNORECASE,
)
# Year in the description: look for a 4-digit year in a sensible range (1950-2039).
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")
# Year in the title (e.g. "Title (1999)") — any four digits in parentheses.
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")
|
||
|
||
|
||
class ParadisehillConnector(BaseMovieConnector):
    """Scraper connector that walks the paradisehill listing and parses each detail page.

    Owns a single `httpx.Client` bound to `BASE_URL`; call `close()` when done.
    """

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the shared HTTP client.

        Args:
            timeout: Timeout handed to `httpx.Client`.
        """
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers={
                "User-Agent": USER_AGENT,
                # Every request needs the is18 cookie; pre-set it to bypass the age gate.
                "Cookie": "is18=1",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml",
            },
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing chronologically (newest first).

        Currently 28 movies/page; the site grows by ~5/day, so a full crawl is
        thousands of pages — in prod `since` is passed to see only the delta
        since the previous run.

        Args:
            since: Stop once a movie's `datePublished` falls on a day before
                `since`. A naive datetime is interpreted as UTC.
            limit: Stop after this many movies have been yielded; `0` (or a
                negative value) yields nothing.

        Yields:
            One `RawMovie` per successfully fetched and parsed detail page.
            Listing failures end the crawl; detail failures skip that movie.
        """
        if limit is not None and limit <= 0:
            return

        # `release_date` only has day resolution, so compare at day granularity.
        # Comparing "midnight of release day" against the `since` timestamp would
        # stop on movies published on the same day as `since`, silently dropping
        # everything uploaded after the previous run finished that day.
        since_day: date | None = None
        if since is not None:
            # Per the original note `since` normally arrives TZ-aware (UTC);
            # treat a naive value as UTC rather than crashing.
            since_utc = since.replace(tzinfo=UTC) if since.tzinfo is None else since.astimezone(UTC)
            since_day = since_utc.date()

        visited: set[str] = set()
        seen = 0
        page = 1
        while True:
            try:
                # Materialise inside the `try`: the listing helper is a generator,
                # so its HTTP errors only surface when it is iterated.
                ids = list(self._fetch_listing_page(page))
            except httpx.HTTPError as e:
                log.warning("paradisehill listing page=%d failed: %s", page, e)
                return

            if not ids:
                log.info("paradisehill: empty listing page=%d, stop", page)
                return

            # Guard against a listing that keeps serving the last page for
            # out-of-range page numbers (would otherwise loop forever), and avoid
            # re-fetching ids that shifted onto the next page during the crawl.
            fresh = [mid for mid in dict.fromkeys(ids) if mid not in visited]
            if not fresh:
                log.info("paradisehill: listing page=%d has no unseen ids, stop", page)
                return
            visited.update(fresh)

            for mid in fresh:
                try:
                    movie = self._fetch_detail(mid)
                except httpx.HTTPError as e:
                    log.warning("paradisehill detail %s failed: %s", mid, e)
                    continue
                if movie is None:
                    continue

                # `since` filter — the listing is chronological, so the first
                # movie published before the boundary day ends the crawl.
                if (
                    since_day is not None
                    and movie.release_date is not None
                    and movie.release_date < since_day
                ):
                    log.info(
                        "paradisehill: hit since boundary at %s (%s), stop",
                        mid, movie.release_date,
                    )
                    return

                yield movie
                seen += 1
                if limit is not None and seen >= limit:
                    return

            page += 1

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies on the given listing page.

        Raises:
            httpx.HTTPError: On transport failure or an error status (raised
                lazily, when the generator is first iterated).
        """
        url = f"{LISTING_PATH}?sort=created_at&page={page}"
        r = self._client.get(url)
        r.raise_for_status()
        for m in _LIST_ITEM_RE.finditer(r.text):
            yield m.group(1)

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Download `/<hex_id>/` and parse it; returns what `_parse_detail` returns.

        Raises:
            httpx.HTTPError: On transport failure or an error status.
        """
        url = f"/{hex_id}/"
        r = self._client.get(url)
        r.raise_for_status()
        return _parse_detail(hex_id, r.text)
|
||
|
||
|
||
def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Parse a detail page's HTML into a `RawMovie`.

    Args:
        hex_id: Hex slug of the movie; used as `external_id` and to build the page URL.
        html: Raw HTML of the detail page.

    Returns:
        The parsed movie, or `None` when no title is found (broken template).
    """
    title_match = _TITLE_RE.search(html)
    if title_match is None:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(title_match.group(1).strip())

    # Director — placeholder values are treated as "no director".
    director: str | None = None
    if (director_match := _DIRECTOR_RE.search(html)) is not None:
        director = _decode_html(director_match.group(1).strip())
        if director and director.lower() in ("unknown", "n/a", "-"):
            director = None

    description: str | None = None
    if (desc_match := _DESCRIPTION_RE.search(html)) is not None:
        description = _decode_html(_strip_tags(desc_match.group(1)).strip())

    release_date: date | None = None
    if (date_match := _DATE_PUBLISHED_RE.search(html)) is not None:
        try:
            release_date = datetime.fromisoformat(date_match.group(1)).date()
        except ValueError:
            # Unparseable datePublished: leave the release date unset.
            release_date = None

    # Year — title first, then description. datePublished is paradisehill's upload
    # date (e.g. 2026-05), not the production year (e.g. 1999), so it is useless
    # for year filtering.
    year_match = _YEAR_IN_TITLE_RE.search(title)
    if year_match is None and description:
        year_match = _YEAR_IN_DESC_RE.search(description)
    release_year: int | None = int(year_match.group(1)) if year_match is not None else None

    poster_match = _POSTER_RE.search(html)
    poster_url: str | None = (BASE_URL + poster_match.group(1)) if poster_match else None
    thumb_match = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = (BASE_URL + thumb_match.group(1)) if thumb_match else None

    studio: RawStudio | None = None
    if (studio_match := _STUDIO_LINK_RE.search(html)) is not None:
        studio = RawStudio(
            external_id=f"paradisehill:{studio_match.group(1)}",
            name=_decode_html(studio_match.group(2).strip()),
        )

    # Genres — only itemprop="genre" inside the movie's own block-inside counts;
    # recommended films carry itemprop="genre" too. An earlier regex required
    # `</div></div><div class="similar"`, but the page sometimes has
    # `</div></noindex>...<div class="similar"` (banner skin from 2026-05-19), which
    # made the block match fail -> fallback to html[:8000] -> 0 tags. Bug report
    # `3c999b27` 2026-05-21 ("no categories"). Robust variant: use the first
    # "similar" div as the stop boundary without requiring specific closing tags.
    movie_block_open = re.search(
        r'<div\s+class="block-inside"[^>]*itemtype="http://schema\.org/Movie"[^>]*>',
        html,
    )
    if movie_block_open is None:
        block = html[:8000]
    else:
        rest = html[movie_block_open.end():]
        # Everything before the first <div class="similar..."> is the movie's own
        # content; the remainder is recommendations and comments, which have their
        # own itemprop="genre".
        similar_open = re.search(r'<div\s+class="similar', rest)
        block = rest[:12000] if similar_open is None else rest[: similar_open.start()]

    # The site mixes two templates per page:
    #   v1: `itemprop="genre">Female Domination</span>`
    #   v2: `itemprop="genre"><a href="/category/...">All Sex</a></span>` (since 2026-05)
    # Hence the optional `<a>` wrapper between the attribute and the text.
    tags: list[RawTag] = []
    genre_matches = re.finditer(
        r'itemprop="genre"[^>]*>\s*(?:<a[^>]*>)?\s*([^<]+)', block, re.IGNORECASE,
    )
    for genre_match in genre_matches:
        name = _decode_html(genre_match.group(1).strip())
        if not name or len(tags) >= 10:
            continue
        tags.append(RawTag(name=name, slug=_slugify(name)))

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(chapter_match.group(1)),
            title=_decode_html(chapter_match.group(2).strip()),
        )
        for chapter_match in _CHAPTER_RE.finditer(html)
    ]

    page_url = f"{BASE_URL}/{hex_id}/"
    page_source = RawPlaybackSource(
        origin=SOURCE_NAME,
        page_url=page_url,
        thumbnail_url=poster_url,
    )

    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        performers=[],  # Paradisehill rarely has cast links — to be filled in via mirrors.
        tags=tags,
        chapters=chapters,
        playback_sources=[page_source],
        raw={"hex_id": hex_id, "html_len": len(html)},
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_TAG_RE = re.compile(r"<[^>]+>")
|
||
|
||
|
||
def _strip_tags(s: str) -> str:
|
||
return _TAG_RE.sub("", s)
|
||
|
||
|
||
_HTML_ENTITIES = {
|
||
"&": "&",
|
||
"<": "<",
|
||
">": ">",
|
||
""": '"',
|
||
"'": "'",
|
||
"'": "'",
|
||
" ": " ",
|
||
"’": "'",
|
||
"‘": "'",
|
||
"”": '"',
|
||
"“": '"',
|
||
"…": "...",
|
||
"—": "—",
|
||
"–": "–",
|
||
}
|
||
|
||
|
||
def _decode_html(s: str) -> str:
|
||
for k, v in _HTML_ENTITIES.items():
|
||
s = s.replace(k, v)
|
||
# Numeric entities
|
||
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
|
||
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
|
||
return s
|
||
|
||
|
||
_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
||
|
||
|
||
def _slugify(s: str) -> str:
|
||
return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"
|