goon/app/connectors/paradisehill.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

325 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Paradisehill connector — primary source for movies (full-length adult films).
Site notes:
- Age gate: the `is18=1` cookie is required (POST /is18/ returns 400 from curl, but simply adding
the cookie to a GET works — the site is tolerant).
- Listing: `/all/?sort=created_at&page=N` — paginated, 28 movies per page, Schema.org Movie microdata.
- Detail: `/<hex_id>/` — full metadata + Video.js playlist (chapters shown as "Part 1/2/3").
What we extract:
- Schema.org microdata: name, description, director, datePublished (upload), image, thumbnailUrl
- Studio: link `/studio/<id>/{name}` (the link is the only place that provides the name and external_id)
- Genres: from Schema.org `itemprop="genre"` (the first one is the movie's main genre)
- Year: parsed from the description when present ("This 1999 film..."), because `datePublished` is the upload date
- Chapters: the number of `<li>...Part N</li>` entries in the Video.js playlist
- Playback: for the MVP `page_url` only — the Video.js playlist URL is loaded dynamically by JS
and requires a login session. Mobile can open the page in a WebView (a degraded experience beats none).
External_id: hex slug from the URL (e.g. `259448f6b75ee` from `/259448f6b75ee/`).
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawMovieChapter,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Site root; also used as the httpx client's base_url and to absolutise image paths.
BASE_URL = "https://paradisehill.cc"
# Desktop Chrome user-agent string sent with every request.
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
LISTING_PATH = "/all/" # ?sort=created_at&page=N
# Connector name; also used as the playback-source origin.
SOURCE_NAME = "paradisehill"
# Microdata extraction — the Schema.org tags are stable and survive light theme
# changes (the yii2 widget renders them invariantly).
# Group 1: the movie title text inside the <h1>.
_TITLE_RE = re.compile(
r'<h1\s+class="title-inside"\s+itemprop="name">([^<]+)</h1>', re.IGNORECASE
)
# Group 1: the director text.
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)</', re.IGNORECASE)
# Group 1: the description body up to the closing </span>; may contain inline tags.
_DESCRIPTION_RE = re.compile(
r'itemprop="description">([^<]+(?:<[^>]+>[^<]+)*)</span>', re.IGNORECASE | re.DOTALL
)
# Group 1: the raw `content` attribute of the datePublished element.
_DATE_PUBLISHED_RE = re.compile(
r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# Group 1: site-relative poster path (must start with /images/).
_POSTER_RE = re.compile(
r'<img\s+itemprop="image"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Group 1: site-relative thumbnail path (must start with /images/).
_THUMBNAIL_RE = re.compile(
r'<img\s+itemprop="thumbnailUrl"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Group 1: numeric studio id from the href; group 2: the link text (studio name).
_STUDIO_LINK_RE = re.compile(r'<a\s+href="/studio/(\d+)/"[^>]*>([^<]+)</a>', re.IGNORECASE)
# Playlist entry. Group 1: numeric data-index; group 2: the entry label.
_CHAPTER_RE = re.compile(
r'<a\s+href="#"\s+class="js-list-item"\s+data-index="(\d+)">([^<]+)</a>',
re.IGNORECASE,
)
# Listing page item. Group 1: the hex id taken from the item's first link.
_LIST_ITEM_RE = re.compile(
r'<div\s+class="item\s+list-film-item"[^>]*>\s*'
r'<a\s+href="/([0-9a-f]+)/"[^>]*>',
re.IGNORECASE,
)
# Year in the description: look for a 4-digit year in a sensible range (1950-2039).
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")
# Year in the title (e.g. "Title (1999)"); matches any four digits in parentheses.
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")
class ParadisehillConnector(BaseMovieConnector):
    """Movie connector that scrapes paradisehill listing and detail pages.

    All requests go through a single ``httpx.Client`` that carries the
    ``is18=1`` age-gate cookie. Call :meth:`close` to release the client.
    """

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the shared HTTP client.

        Args:
            timeout: Timeout in seconds passed to ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers={
                "User-Agent": USER_AGENT,
                # Every request needs the is18 cookie; pre-set it to skip the age gate.
                "Cookie": "is18=1",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml",
            },
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing, newest first.

        The site currently serves 28 movies per page and grows by roughly five
        movies a day, so a full crawl is thousands of pages; production runs
        pass ``since`` to fetch only the delta from the previous run.

        Args:
            since: Stop once a movie's upload date falls on a calendar day (UTC)
                before this moment. The upload date is only known at day
                precision, so movies uploaded on the same day as ``since`` are
                yielded again rather than risk skipping later same-day uploads.
                A naive datetime is interpreted as UTC.
                NOTE(review): assumes consumers upsert by ``external_id`` so the
                same-day repeats are harmless — confirm.
            limit: Stop after this many movies; ``limit <= 0`` yields nothing.

        Yields:
            One ``RawMovie`` per successfully fetched and parsed detail page.

        A failing listing request ends the crawl with a warning; a failing
        detail request only skips that movie.
        """
        if limit is not None and limit <= 0:
            return

        since_date: date | None = None
        if since is not None:
            # Normalise to UTC; a naive value would otherwise be unusable next to
            # the UTC-based upload dates.
            if since.tzinfo is None:
                since_utc = since.replace(tzinfo=UTC)
            else:
                since_utc = since.astimezone(UTC)
            since_date = since_utc.date()

        # Ids already handled. Protects against a listing that keeps serving the
        # last page for out-of-range page numbers (which would loop forever) and
        # against ids shifting onto the next page while the crawl is running.
        visited: set[str] = set()
        yielded = 0
        page = 1
        while True:
            try:
                page_ids = list(self._fetch_listing_page(page))
            except httpx.HTTPError as e:
                log.warning("paradisehill listing page=%d failed: %s", page, e)
                return
            if not page_ids:
                log.info("paradisehill: empty listing page=%d, stop", page)
                return

            fresh_ids = [mid for mid in dict.fromkeys(page_ids) if mid not in visited]
            if not fresh_ids:
                log.info("paradisehill: listing page=%d has no new ids, stop", page)
                return
            visited.update(fresh_ids)

            for mid in fresh_ids:
                try:
                    movie = self._fetch_detail(mid)
                except httpx.HTTPError as e:
                    log.warning("paradisehill detail %s failed: %s", mid, e)
                    continue
                if movie is None:
                    continue
                # The listing is chronological, so the first movie uploaded on a
                # day before `since` means everything after it is older as well.
                if (
                    since_date is not None
                    and movie.release_date is not None
                    and movie.release_date < since_date
                ):
                    log.info(
                        "paradisehill: hit since boundary at %s (%s), stop",
                        mid, movie.release_date,
                    )
                    return
                yield movie
                yielded += 1
                if limit is not None and yielded >= limit:
                    return
            page += 1

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies found on listing page ``page``.

        Raises:
            httpx.HTTPError: If the request fails or returns an error status.
        """
        url = f"{LISTING_PATH}?sort=created_at&page={page}"
        r = self._client.get(url)
        r.raise_for_status()
        for m in _LIST_ITEM_RE.finditer(r.text):
            yield m.group(1)

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Fetch `/<hex_id>/` and parse it; ``None`` when the page has no title.

        Raises:
            httpx.HTTPError: If the request fails or returns an error status.
        """
        url = f"/{hex_id}/"
        r = self._client.get(url)
        r.raise_for_status()
        return _parse_detail(hex_id, r.text)
def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Turn the HTML of a detail page into a ``RawMovie``.

    Returns ``None`` (after logging a warning) when no title is found, which
    points at a broken template.
    """
    title_match = _TITLE_RE.search(html)
    if title_match is None:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(title_match.group(1).strip())

    # Director: placeholder values are treated as "not provided".
    director: str | None = None
    director_match = _DIRECTOR_RE.search(html)
    if director_match is not None:
        candidate = _decode_html(director_match.group(1).strip())
        director = None if candidate.lower() in ("unknown", "n/a", "-") else candidate

    description: str | None = None
    description_match = _DESCRIPTION_RE.search(html)
    if description_match is not None:
        description = _decode_html(_strip_tags(description_match.group(1)).strip())

    # datePublished is the upload date; an unparseable value is simply ignored.
    release_date: date | None = None
    published_match = _DATE_PUBLISHED_RE.search(html)
    if published_match is not None:
        try:
            release_date = datetime.fromisoformat(published_match.group(1)).date()
        except ValueError:
            release_date = None

    # Production year: the title wins, the description is the fallback.
    # datePublished is the upload date (e.g. 2026-05), not the production year
    # (e.g. 1999), so it is not used here.
    year_match = _YEAR_IN_TITLE_RE.search(title)
    if year_match is None and description:
        year_match = _YEAR_IN_DESC_RE.search(description)
    release_year: int | None = int(year_match.group(1)) if year_match else None

    poster_match = _POSTER_RE.search(html)
    poster_url: str | None = BASE_URL + poster_match.group(1) if poster_match else None

    thumb_match = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = BASE_URL + thumb_match.group(1) if thumb_match else None

    studio: RawStudio | None = None
    studio_match = _STUDIO_LINK_RE.search(html)
    if studio_match is not None:
        studio_id, studio_name = studio_match.group(1), studio_match.group(2)
        studio = RawStudio(
            external_id=f"paradisehill:{studio_id}",
            name=_decode_html(studio_name.strip()),
        )

    # Genres: recommended films also carry itemprop="genre", so restrict the
    # search to the movie's own block-inside section; fall back to the first
    # 8000 characters when that section cannot be located.
    scoped = re.search(
        r'<div\s+class="block-inside"[^>]*itemtype="http://schema\.org/Movie"[^>]*>'
        r'(.*?)</div>\s*</div>\s*<div\s+class="similar',
        html,
        re.DOTALL,
    )
    block = html[:8000] if scoped is None else scoped.group(1)
    genre_names = [
        _decode_html(found.group(1).strip())
        for found in re.finditer(r'itemprop="genre"[^>]*>([^<]+)</', block, re.IGNORECASE)
    ]
    # Keep at most the first ten non-empty genres.
    kept_genres = [genre for genre in genre_names if genre][:10]
    tags: list[RawTag] = [RawTag(name=genre, slug=_slugify(genre)) for genre in kept_genres]

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(entry.group(1)),
            title=_decode_html(entry.group(2).strip()),
        )
        for entry in _CHAPTER_RE.finditer(html)
    ]

    # Playback is the detail page itself; the poster doubles as its thumbnail.
    page_url = f"{BASE_URL}/{hex_id}/"
    page_source = RawPlaybackSource(
        origin=SOURCE_NAME,
        page_url=page_url,
        thumbnail_url=poster_url,
    )

    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        performers=[],  # Paradisehill rarely has cast links — to be filled in from mirrors.
        tags=tags,
        chapters=chapters,
        playback_sources=[page_source],
        raw={"hex_id": hex_id, "html_len": len(html)},
    )
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&",
"&lt;": "<",
"&gt;": ">",
"&quot;": '"',
"&#39;": "'",
"&apos;": "'",
"&nbsp;": " ",
"&rsquo;": "'",
"&lsquo;": "'",
"&rdquo;": '"',
"&ldquo;": '"',
"&hellip;": "...",
"&mdash;": "",
"&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
# Numeric entities
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s
_SLUG_RE = re.compile(r"[^a-z0-9]+")
def _slugify(s: str) -> str:
return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"