"""Paradisehill connector — primary source for movies (full-length adult films).

Site notes:
- Age gate: the `is18=1` cookie is required (POST /is18/ returns 400 from curl, but
  simply adding the cookie to a GET works — the site is tolerant).
- Listing: `/all/?sort=created_at&page=N` — pagination of 28 movies per page,
  Schema.org Movie microdata.
- Detail: `/<hex_id>/` — full metadata + Video.js playlist (chapters as "Part 1/2/3").

What we extract:
- Schema.org microdata: name, description, director, datePublished (upload), image,
  thumbnailUrl
- Studio: link `/studio/<id>/{name}` (only the link provides the name and external_id)
- Genres: from Schema.org `itemprop="genre"` (first = the movie's main genre)
- Year: parsed from the description when present ("This 1999 film..."), because
  `datePublished` is the upload date
- Chapters: the number of "...Part N" list items in the Video.js playlist
- Playback: for the MVP `page_url` only — the Video.js playlist URL is loaded
  dynamically by JS and requires a login session. Mobile can open the page in a
  WebView (degradation is better than nothing).

External_id: hex slug from the URL (e.g. `259448f6b75ee` from `/259448f6b75ee/`).
"""
from __future__ import annotations

import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime, timedelta
from typing import Any

import httpx

from app.connectors.base import (
    BaseMovieConnector,
    RawMovie,
    RawMovieChapter,
    RawPerformer,
    RawPlaybackSource,
    RawStudio,
    RawTag,
)
from app.models.source import SourceKind

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Site origin; used as the httpx base URL and as the prefix for poster/backdrop paths.
BASE_URL = "https://paradisehill.cc"
# Desktop Chrome user agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
LISTING_PATH = "/all/"  # ?sort=created_at&page=N
SOURCE_NAME = "paradisehill"

# Microdata extraction — the Schema.org tags are stable and survive light theme
# changes (the yii2 widget renders them invariantly).
#
# NOTE(review): several literals below look truncated in this copy — markup-like
# fragments appear to be missing from the patterns, and no definitions of
# `_DESCRIPTION_RE`, `_THUMBNAIL_RE` or `_STUDIO_LINK_RE` are visible here although
# `_parse_detail` uses them. The code is reproduced exactly as it appears; verify
# every pattern against the version-controlled file before relying on it.
_TITLE_RE = re.compile(
    r'([^<]+)', re.IGNORECASE
)
# NOTE(review): two capture groups and DOTALL on a "director" pattern look like two
# separate definitions fused together — confirm.
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)([^<]+(?:<[^>]+>[^<]+)*)', re.IGNORECASE | re.DOTALL
)
# Captures the `content` attribute that follows itemprop="datePublished".
_DATE_PUBLISHED_RE = re.compile(
    r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# NOTE(review): this literal starts mid-pattern (`]*>`) — presumably truncated; confirm.
_POSTER_RE = re.compile(
    r']*>([^<]+)', re.IGNORECASE)
# NOTE(review): `_parse_detail` reads groups 1 and 2 of this pattern, but only one
# group is visible here — presumably truncated; confirm.
_CHAPTER_RE = re.compile(
    r'([^<]+)',
    re.IGNORECASE,
)

# videoList JS array on the detail page — may contain multiple parts (Video.js playlist):
#   var videoList = [{"sources":[{"src":"...part1.mp4","type":"video/mp4"}]}, ...]
# Without parsing this, the mobile WebView plays only the first part and skips the rest.
# Bug reports `c5693926`/`418270e4` 2026-05-21 ("loads only 1 of 4 parts").
# NOTE(review): the reviewed copy of this file had markup-like text stripped out of
# several string literals. Literals that were still syntactically valid are kept
# exactly as they appeared and are flagged individually below; verify each flagged
# item against the version-controlled file.

# Captures the JSON array assigned to `var videoList` on a detail page.
_VIDEO_LIST_RE = re.compile(r"var\s+videoList\s*=\s*(\[.*?\])\s*;", re.IGNORECASE | re.DOTALL)
# Captures every "src" value that points at an .mp4 (query string allowed).
_VIDEO_SRC_RE = re.compile(r'"src"\s*:\s*"([^"]+\.mp4[^"]*)"', re.IGNORECASE)


def extract_video_parts(html: str) -> list[tuple[str, str]]:
    """Extract the MP4 parts from a paradisehill detail page.

    Args:
        html: Raw HTML of a detail page.

    Returns:
        ``[(mp4_url, label), ...]`` in document order, labelled "Part 1", "Part 2", ...
        An empty list when ``videoList`` is absent or has no MP4 sources
        (login-only movies).
    """
    found = _VIDEO_LIST_RE.search(html)
    if found is None:
        return []
    # The array is embedded as JSON, so slashes may arrive escaped as "\/".
    return [
        (src.group(1).replace("\\/", "/"), f"Part {index}")
        for index, src in enumerate(_VIDEO_SRC_RE.finditer(found.group(1)), start=1)
    ]


def fetch_and_extract_parts(page_url: str, *, timeout: float = 20.0) -> list[tuple[str, str]]:
    """Resolve-time helper: download a detail page and extract its videoList parts.

    Used by `app.api.playback.resolve_movie_playback` for origin='paradisehill'.

    Args:
        page_url: Absolute URL of the detail page.
        timeout: httpx timeout in seconds.

    Returns:
        The result of `extract_video_parts` for the downloaded page.

    Raises:
        httpx.HTTPError: on transport errors or a non-success status code.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Cookie": "is18=1",  # age gate
        "Accept-Language": "en-US,en;q=0.9",
    }
    with httpx.Client(
        timeout=timeout,
        follow_redirects=True,
        headers=request_headers,
    ) as client:
        response = client.get(page_url)
        response.raise_for_status()
        return extract_video_parts(response.text)


# Listing page item.
# NOTE(review): this literal is truncated in the reviewed copy — it has no capture
# group, while `_fetch_listing_page` reads `group(1)` (the hex id). Restore the
# original pattern before running; as written, `group(1)` raises IndexError.
_LIST_ITEM_RE = re.compile(
    r']*>\s*'
    r']*>',
    re.IGNORECASE,
)

# Year in the description: a 4-digit year within a sensible range.
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")

# Year in the title (e.g. "Title (1999)").
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")


class ParadisehillConnector(BaseMovieConnector):
    """Movie connector that scrapes the paradisehill listing and detail pages."""

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the shared HTTP client.

        Args:
            timeout: httpx timeout in seconds, applied to every request.
        """
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers={
                "User-Agent": USER_AGENT,
                # Every request needs the is18 cookie. Pre-set it to bypass the age gate.
                "Cookie": "is18=1",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml",
            },
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing chronologically (newest first).

        Currently 28 movies/page; the site grows ~5/day, so a full crawl is on the
        order of thousands of pages — in prod `since` is used to see only the delta
        since the previous run.

        Args:
            since: Stop once a movie's datePublished falls more than one day before
                this timestamp.
            limit: Stop after this many movies. A value <= 0 yields nothing.

        Yields:
            One `RawMovie` per detail page that could be fetched and parsed.
            A listing-page HTTP error ends the crawl; a detail-page HTTP error only
            skips that movie. Both are logged as warnings.
        """
        if limit is not None and limit <= 0:
            # Without this guard the limit was only checked after the first yield.
            return

        # `since` filter — a datePublished (= upload date on paradisehill) below the
        # threshold stops the crawl (the listing is chronological).
        #
        # NOTE: release_date is a DATE (no time). Previously combine()→00:00 compared
        # against the `since` TIMESTAMP cut the crawl at the FIRST movie from the day
        # == since (midnight < since at any time of day) → same-day uploads were
        # systematically lost, because movie-ingest is daily (seen=0-2/run despite
        # fresh movies on the front page — bug report 2026-06-01 "Movies stoją").
        # Fix: compare by DATE with a 1-day grace; re-fetching fresh ones is cheap
        # (the external_records upsert skips an unchanged hash).
        cutoff = (since - timedelta(days=1)).date() if since is not None else None

        yielded = 0
        seen_ids: set[str] = set()
        page = 1
        while True:
            try:
                # Materialize inside the `try`: the generator performs the HTTP
                # request lazily, so errors only surface while it is consumed.
                ids = list(self._fetch_listing_page(page))
            except httpx.HTTPError as e:
                log.warning("paradisehill listing page=%d failed: %s", page, e)
                return
            if not ids:
                log.info("paradisehill: empty listing page=%d, stop", page)
                return

            # Skip ids already handled: the same id may appear twice on a page, and
            # items can reappear on the next page when new uploads shift the listing
            # during the crawl. A page with nothing new also ends the crawl, so a site
            # that keeps serving the same page cannot loop forever.
            fresh = [mid for mid in dict.fromkeys(ids) if mid not in seen_ids]
            if not fresh:
                log.info("paradisehill: listing page=%d has no new ids, stop", page)
                return
            seen_ids.update(fresh)

            for mid in fresh:
                try:
                    movie = self._fetch_detail(mid)
                except httpx.HTTPError as e:
                    log.warning("paradisehill detail %s failed: %s", mid, e)
                    continue
                if movie is None:
                    continue

                if (
                    cutoff is not None
                    and movie.release_date is not None
                    and movie.release_date < cutoff
                ):
                    log.info(
                        "paradisehill: hit since boundary at %s (%s), stop",
                        mid,
                        movie.release_date,
                    )
                    return

                yield movie
                yielded += 1
                if limit is not None and yielded >= limit:
                    return
            page += 1

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies on the given listing page.

        Raises:
            httpx.HTTPError: on transport errors or a non-success status code
                (raised when the generator is first advanced).
        """
        url = f"{LISTING_PATH}?sort=created_at&page={page}"
        response = self._client.get(url)
        response.raise_for_status()
        for item in _LIST_ITEM_RE.finditer(response.text):
            yield item.group(1)

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Download `/<hex_id>/` and parse it; None when the page has no title.

        Raises:
            httpx.HTTPError: on transport errors or a non-success status code.
        """
        response = self._client.get(f"/{hex_id}/")
        response.raise_for_status()
        return _parse_detail(hex_id, response.text)


# Opening tag of the Schema.org Movie container on a detail page.
# NOTE(review): the start of this literal is truncated in the reviewed copy
# (it begins with `]*`); kept as seen — restore the original.
_MOVIE_BLOCK_START_RE = re.compile(r']*itemtype="http://schema\.org/Movie"[^>]*>')

# Stop boundary: everything before it is the movie's own content (genre/cast/etc.);
# what follows is recommendations and comments, which carry their own
# itemprop="genre".
# NOTE(review): the original pattern (and the statement deriving `block` from it) was
# lost in the reviewed copy. Until it is restored the whole remainder of the page is
# scanned, so genres of recommended movies can leak into `tags`.
_MOVIE_BLOCK_STOP_RE: re.Pattern[str] | None = None

# Genre markup seen so far: an older variant ending in "...Female Domination" and
# `itemprop="genre">All Sex` (since 2026-05). An optional wrapper tag may sit between
# `itemprop` and the text — without allowing for it one variant produced empty names.
# NOTE(review): the optional-wrapper group is truncated in the reviewed copy
# (`(?:]*>)?`); kept as seen — restore the original.
_GENRE_RE = re.compile(
    r'itemprop="genre"[^>]*>\s*(?:]*>)?\s*([^<]+)',
    re.IGNORECASE,
)

# Upper bound on the number of genre tags taken from one detail page.
_MAX_TAGS = 10


def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Parse detail-page HTML into a `RawMovie`.

    Args:
        hex_id: The movie's hex slug; becomes `external_id` and part of the page URL.
        html: Raw HTML of the detail page.

    Returns:
        The parsed movie, or None when no title is found (broken template).
    """
    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(m_title.group(1).strip())

    m_director = _DIRECTOR_RE.search(html)
    director = _decode_html(m_director.group(1).strip()) if m_director else None
    # Placeholder values mean "no director".
    if director and director.lower() in ("unknown", "n/a", "-"):
        director = None

    m_desc = _DESCRIPTION_RE.search(html)
    description = _decode_html(_strip_tags(m_desc.group(1)).strip()) if m_desc else None

    release_date: date | None = None
    m_date = _DATE_PUBLISHED_RE.search(html)
    if m_date:
        try:
            release_date = datetime.fromisoformat(m_date.group(1)).date()
        except ValueError:
            # Best effort: keep the movie without a date, but leave a trace.
            log.debug(
                "paradisehill: unparseable datePublished %r in detail %s",
                m_date.group(1),
                hex_id,
            )

    # Year — first from the title, then from the description. datePublished is the
    # paradisehill upload date (e.g. 2026-05), not the production year (e.g. 1999),
    # so it is useless for year filtering.
    release_year: int | None = None
    m_title_year = _YEAR_IN_TITLE_RE.search(title)
    if m_title_year:
        release_year = int(m_title_year.group(1))
    elif description:
        m_desc_year = _YEAR_IN_DESC_RE.search(description)
        if m_desc_year:
            release_year = int(m_desc_year.group(1))

    m_poster = _POSTER_RE.search(html)
    poster_url: str | None = BASE_URL + m_poster.group(1) if m_poster else None

    m_thumb = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = BASE_URL + m_thumb.group(1) if m_thumb else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_LINK_RE.search(html)
    if m_studio:
        studio = RawStudio(
            external_id=f"paradisehill:{m_studio.group(1)}",
            name=_decode_html(m_studio.group(2).strip()),
        )

    # Genre — only itemprop="genre" inside the movie's own block counts; recommended
    # movies carry itemprop="genre" too, so matching is limited to that block.
    tags: list[RawTag] = []
    block_start = _MOVIE_BLOCK_START_RE.search(html)
    if block_start:
        rest = html[block_start.end():]
        stop = _MOVIE_BLOCK_STOP_RE.search(rest) if _MOVIE_BLOCK_STOP_RE else None
        block = rest[: stop.start()] if stop else rest
        for m_genre in _GENRE_RE.finditer(block):
            genre_name = _decode_html(m_genre.group(1).strip())
            if genre_name and len(tags) < _MAX_TAGS:
                tags.append(RawTag(name=genre_name, slug=_slugify(genre_name)))

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(m_ch.group(1)),
            title=_decode_html(m_ch.group(2).strip()),
        )
        for m_ch in _CHAPTER_RE.finditer(html)
    ]

    page_url = f"{BASE_URL}/{hex_id}/"
    playback_sources = [
        RawPlaybackSource(
            origin=SOURCE_NAME,
            page_url=page_url,
            thumbnail_url=poster_url,
        )
    ]

    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        performers=[],  # Paradisehill rarely has cast links — to be filled in via mirrors.
        tags=tags,
        chapters=chapters,
        playback_sources=playback_sources,
        raw={"hex_id": hex_id, "html_len": len(html)},
    )


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

_TAG_RE = re.compile(r"<[^>]+>")


def _strip_tags(s: str) -> str:
    """Remove every `<...>` tag from *s*, keeping the text between tags."""
    return _TAG_RE.sub("", s)


# Named entities decoded by `_decode_html`, applied in this order. `&amp;` goes first,
# so double-encoded input such as `&amp;quot;` ends up fully decoded.
# NOTE(review): the keys were shown already entity-decoded in the reviewed copy and
# are reconstructed here from their values; the two apostrophe keys and the `&nbsp;`
# value (plain space) are assumptions — confirm against the original table.
_HTML_ENTITIES = {
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#39;": "'",
    "&apos;": "'",
    "&nbsp;": " ",
    "&rsquo;": "'",
    "&lsquo;": "'",
    "&rdquo;": '"',
    "&ldquo;": '"',
    "&hellip;": "...",
    "&mdash;": "—",
    "&ndash;": "–",
}

_DEC_ENTITY_RE = re.compile(r"&#(\d+);")
_HEX_ENTITY_RE = re.compile(r"&#[xX]([0-9a-fA-F]+);")

# Substitute for numeric references that do not name a usable character.
_REPLACEMENT_CHAR = "\ufffd"


def _codepoint_to_char(digits: str, base: int) -> str:
    """Turn the digits of a numeric character reference into a character.

    NUL, surrogates, values above U+10FFFF and digit strings `int()` refuses to
    convert all become U+FFFD instead of raising — a single malformed entity must
    not abort parsing of a page.
    """
    try:
        codepoint = int(digits, base)
    except ValueError:
        return _REPLACEMENT_CHAR
    if codepoint == 0 or codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
        return _REPLACEMENT_CHAR
    return chr(codepoint)


def _decode_html(s: str) -> str:
    """Decode the HTML entities that occur in scraped text.

    Named entities from `_HTML_ENTITIES` are replaced first, then decimal
    (`&#39;`) and hexadecimal (`&#x27;`) numeric references. Unknown named
    entities are left untouched.
    """
    for entity, replacement in _HTML_ENTITIES.items():
        s = s.replace(entity, replacement)
    s = _DEC_ENTITY_RE.sub(lambda m: _codepoint_to_char(m.group(1), 10), s)
    s = _HEX_ENTITY_RE.sub(lambda m: _codepoint_to_char(m.group(1), 16), s)
    return s


_SLUG_RE = re.compile(r"[^a-z0-9]+")


def _slugify(s: str) -> str:
    """Lower-case *s* and collapse every run of non `[a-z0-9]` characters to "-".

    Leading/trailing dashes are trimmed; an empty result falls back to "tag".
    """
    return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"