goon/app/connectors/paradisehill.py
jtrzupek cd12348782 fix(movies): paradisehill delta date-granularity + browse cadence docs
- paradisehill.fetch_movies compared release_date coerced to midnight against the
  `since` timestamp, so the chronological crawl stopped at the first upload dated
  the same calendar day as `since` and silently dropped most new movies (0-2 seen
  per run; Movies tab stalled). Compare by DATE with a 1-day grace instead; idempotent
  external_records upsert dedups the re-fetched recent window.
- scripts/backfill_paradisehill_movies.py: one-off no-delta deep crawl to recover the
  backlog missed during the bug (idempotent, resumable).
- docs: correct stale 'raz dziennie/24h' browse-latest comments to 6h (4x/day), the
  actual configured cadence (config.py sched_browse_latest_hours=6).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 17:00:10 +02:00

385 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Paradisehill connector — primary source dla movies (full-length adult films).
Site notes:
- Age gate: the `is18=1` cookie is required (POST /is18/ returns 400 from curl, but simply
adding the cookie to a GET request works — the site is tolerant).
- Listing: `/all/?sort=created_at&page=N` — paginated, 28 films per page, Schema.org Movie microdata.
- Detail: `/<hex_id>/` — full metadata + Video.js playlist (chapters shown as "Part 1/2/3").
What we extract:
- Schema.org microdata: name, description, director, datePublished (upload), image, thumbnailUrl
- Studio: link `/studio/<id>/{name}` (only the link provides the name and the external_id)
- Genres: from Schema.org `itemprop="genre"` (the first one = the movie's main genre)
- Year: parsed from the title or description when present ("This 1999 film..."), because
`datePublished` is the upload date
- Chapters: the `<li>...Part N</li>` entries of the Video.js playlist
- Playback: for the MVP `page_url` only — the Video.js playlist URL is loaded dynamically by JS
and requires a login session. Mobile can open the page in a WebView (degraded beats nothing).
External_id: the hex slug from the URL (e.g. `259448f6b75ee` from `/259448f6b75ee/`).
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime, timedelta
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawMovieChapter,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Site root; used as the HTTP client's base URL and to absolutize image paths.
BASE_URL = "https://paradisehill.cc"
# Desktop Chrome User-Agent string sent with every request made by this module.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# Listing endpoint path; the crawl appends the query string shown on the right.
LISTING_PATH = "/all/" # ?sort=created_at&page=N
# Connector name; also used as the `origin` of emitted playback sources.
SOURCE_NAME = "paradisehill"
# Microdata extraction — Schema.org tags are stable and survive light theme
# changes (the yii2 widget renders them invariantly).
# Title: text content of `<h1 class="title-inside" itemprop="name">`.
_TITLE_RE = re.compile(
    r'<h1\s+class="title-inside"\s+itemprop="name">([^<]+)</h1>', re.IGNORECASE
)
# Director: text that directly follows `itemprop="director">`.
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)</', re.IGNORECASE)
# Description: text (optionally interleaved with inline tags) up to the closing </span>.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">([^<]+(?:<[^>]+>[^<]+)*)</span>', re.IGNORECASE | re.DOTALL
)
# Upload date: the `content` attribute placed right after `itemprop="datePublished"`.
_DATE_PUBLISHED_RE = re.compile(
    r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# Poster: site-relative `/images/...` path of the `itemprop="image"` <img>.
_POSTER_RE = re.compile(
    r'<img\s+itemprop="image"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Thumbnail: site-relative `/images/...` path of the `itemprop="thumbnailUrl"` <img>.
_THUMBNAIL_RE = re.compile(
    r'<img\s+itemprop="thumbnailUrl"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Studio link: captures the numeric studio id and the link text.
# NOTE(review): this only matches hrefs of the exact form `/studio/<id>/`, while the
# module docstring describes `/studio/<id>/{name}` — confirm which form the site emits.
_STUDIO_LINK_RE = re.compile(r'<a\s+href="/studio/(\d+)/"[^>]*>([^<]+)</a>', re.IGNORECASE)
# Playlist entry: captures `data-index` and the link text of each `js-list-item` anchor.
_CHAPTER_RE = re.compile(
    r'<a\s+href="#"\s+class="js-list-item"\s+data-index="(\d+)">([^<]+)</a>',
    re.IGNORECASE,
)
# `videoList` JS array on the detail page — it may hold multiple parts (Video.js playlist):
#   var videoList = [{"sources":[{"src":"...part1.mp4","type":"video/mp4"}]}, ...]
# Without parsing it, the mobile WebView plays only the first part and skips the rest.
# Bug reports `c5693926`/`418270e4` 2026-05-21 ("loads only 1 of 4 parts").
_VIDEO_LIST_RE = re.compile(r"var\s+videoList\s*=\s*(\[.*?\])\s*;", re.IGNORECASE | re.DOTALL)
_VIDEO_SRC_RE = re.compile(r'"src"\s*:\s*"([^"]+\.mp4[^"]*)"', re.IGNORECASE)


def extract_video_parts(html: str) -> list[tuple[str, str]]:
    """Extract the list of MP4 parts from a paradisehill detail page.

    Args:
        html: Raw HTML of a detail page.

    Returns:
        ``[(mp4_url, label), ...]`` in playlist order, labelled ``"Part 1"``,
        ``"Part 2"``, ... The list is empty when no ``videoList`` assignment is
        found or when it contains no ``.mp4`` sources.
    """
    playlist = _VIDEO_LIST_RE.search(html)
    if playlist is None:
        return []
    raw_urls = _VIDEO_SRC_RE.findall(playlist.group(1))
    # The URLs sit inside a JS/JSON string literal, so slashes arrive escaped as `\/`.
    return [
        (raw_url.replace("\\/", "/"), f"Part {position}")
        for position, raw_url in enumerate(raw_urls, start=1)
    ]
def fetch_and_extract_parts(page_url: str, *, timeout: float = 20.0) -> list[tuple[str, str]]:
    """Resolve-time helper: download a detail page and extract its video parts.

    According to the original note, this is used by
    `app.api.playback.resolve_movie_playback` for origin='paradisehill'.

    Args:
        page_url: Absolute URL of the detail page.
        timeout: Timeout handed to the short-lived HTTP client.

    Returns:
        The ``(mp4_url, label)`` pairs produced by ``extract_video_parts``.

    Raises:
        httpx.HTTPError: If the request fails or the response status is an
            error (via ``raise_for_status``).
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        # Pre-set age-gate cookie so the GET is served the real page.
        "Cookie": "is18=1",
        "Accept-Language": "en-US,en;q=0.9",
    }
    with httpx.Client(
        timeout=timeout,
        follow_redirects=True,
        headers=request_headers,
    ) as session:
        response = session.get(page_url)
        response.raise_for_status()
        page_html = response.text
    return extract_video_parts(page_html)
# Listing page item: captures the hex id from the link that opens each
# `<div class="item list-film-item">` container.
_LIST_ITEM_RE = re.compile(
    r'<div\s+class="item\s+list-film-item"[^>]*>\s*'
    r'<a\s+href="/([0-9a-f]+)/"[^>]*>',
    re.IGNORECASE,
)
# Year in the description: a 4-digit year within a plausible range (1950-2039).
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")
# Year in the title (e.g. "Title (1999)"): any four digits in parentheses, no range check.
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")
class ParadisehillConnector(BaseMovieConnector):
    """Scraper connector that crawls the paradisehill listing and detail pages.

    One ``httpx.Client`` is created per connector instance and reused for all
    requests; call ``close()`` when done with the connector.
    """

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the shared HTTP client.

        Args:
            timeout: Timeout passed to the underlying ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers={
                "User-Agent": USER_AGENT,
                # Every request needs the is18 cookie; pre-set it to skip the age gate.
                "Cookie": "is18=1",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml",
            },
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing, newest uploads first.

        Args:
            since: Delta boundary. The crawl stops at the first movie whose
                upload date is more than one calendar day older than
                ``since`` (date-granular comparison with a 1-day grace).
                Movies without a parsed upload date never trigger the stop.
            limit: Maximum number of movies to yield. ``None`` means no
                limit; zero or a negative value yields nothing and performs
                no requests.

        Yields:
            One ``RawMovie`` per successfully fetched and parsed detail page.
            Detail pages that fail with an HTTP error or have no title are
            skipped. A failing listing page ends the crawl (logged as a
            warning), as does an empty page or a page with no new ids.
        """
        # A non-positive limit means "nothing": bail out before any request
        # instead of yielding one movie and only then checking the limit.
        if limit is not None and limit <= 0:
            return

        # `release_date` is a DATE (no time of day). Comparing midnight of that
        # date against the `since` TIMESTAMP used to cut the crawl at the first
        # movie uploaded on the same calendar day as `since`, silently dropping
        # same-day uploads (bug report 2026-06-01, "Movies are stuck").
        # Compare by DATE with a 1-day grace instead; re-fetching the recent
        # window is cheap because the external_records upsert skips records
        # whose hash is unchanged.
        cutoff = (since - timedelta(days=1)).date() if since is not None else None

        seen = 0
        # Ids already attempted in this crawl. New uploads arriving mid-crawl
        # shift items onto the next page, which would otherwise fetch and yield
        # the same movie twice.
        visited: set[str] = set()
        page = 1
        while True:
            try:
                # Materialize inside the try: the listing generator performs
                # its HTTP request lazily, on first iteration.
                ids = list(self._fetch_listing_page(page))
            except httpx.HTTPError as e:
                log.warning("paradisehill listing page=%d failed: %s", page, e)
                return
            if not ids:
                log.info("paradisehill: empty listing page=%d, stop", page)
                return

            fresh = [mid for mid in dict.fromkeys(ids) if mid not in visited]
            if not fresh:
                # Nothing new on this page (e.g. the site keeps serving the
                # last page for out-of-range page numbers): stop rather than
                # loop forever.
                log.info("paradisehill: listing page=%d has no new ids, stop", page)
                return
            visited.update(fresh)

            for mid in fresh:
                try:
                    movie = self._fetch_detail(mid)
                except httpx.HTTPError as e:
                    log.warning("paradisehill detail %s failed: %s", mid, e)
                    continue
                if movie is None:
                    continue
                # The listing is chronological, so the first upload date below
                # the cutoff ends the crawl.
                if (
                    cutoff is not None
                    and movie.release_date is not None
                    and movie.release_date < cutoff
                ):
                    log.info(
                        "paradisehill: hit since boundary at %s (%s), stop",
                        mid, movie.release_date,
                    )
                    return
                yield movie
                seen += 1
                if limit is not None and seen >= limit:
                    return
            page += 1

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies listed on the given page.

        Raises:
            httpx.HTTPError: On request failure or an error status; raised
                when iteration starts, because this is a generator.
        """
        url = f"{LISTING_PATH}?sort=created_at&page={page}"
        r = self._client.get(url)
        r.raise_for_status()
        for m in _LIST_ITEM_RE.finditer(r.text):
            yield m.group(1)

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Fetch `/<hex_id>/` and parse it; ``None`` when the page has no title.

        Raises:
            httpx.HTTPError: On request failure or an error status.
        """
        url = f"/{hex_id}/"
        r = self._client.get(url)
        r.raise_for_status()
        return _parse_detail(hex_id, r.text)
def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Parse detail-page HTML into a ``RawMovie``.

    Args:
        hex_id: Hex slug of the movie; becomes the ``external_id`` and is used
            to build the page URL.
        html: Raw HTML of the detail page.

    Returns:
        The parsed movie, or ``None`` (after logging a warning) when the page
        has no title element, i.e. the template is broken.
    """
    title_match = _TITLE_RE.search(html)
    if title_match is None:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(title_match.group(1).strip())

    director: str | None = None
    director_match = _DIRECTOR_RE.search(html)
    if director_match is not None:
        director = _decode_html(director_match.group(1).strip())
        # Placeholder values carry no information; treat them as missing.
        if director and director.lower() in ("unknown", "n/a", "-"):
            director = None

    description: str | None = None
    description_match = _DESCRIPTION_RE.search(html)
    if description_match is not None:
        description = _decode_html(_strip_tags(description_match.group(1)).strip())

    release_date: date | None = None
    date_match = _DATE_PUBLISHED_RE.search(html)
    if date_match is not None:
        try:
            release_date = datetime.fromisoformat(date_match.group(1)).date()
        except ValueError:
            # Unparseable upload date: leave release_date unset.
            pass

    # Year — title first, then description. datePublished is the paradisehill
    # upload date (e.g. 2026-05), not the production year (e.g. 1999), so it is
    # useless for year filtering.
    release_year: int | None = None
    year_match = _YEAR_IN_TITLE_RE.search(title)
    if year_match is None and description:
        year_match = _YEAR_IN_DESC_RE.search(description)
    if year_match is not None:
        release_year = int(year_match.group(1))

    poster_match = _POSTER_RE.search(html)
    poster_url: str | None = (BASE_URL + poster_match.group(1)) if poster_match else None

    thumb_match = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = (BASE_URL + thumb_match.group(1)) if thumb_match else None

    studio: RawStudio | None = None
    studio_match = _STUDIO_LINK_RE.search(html)
    if studio_match is not None:
        studio_id, studio_name = studio_match.group(1), studio_match.group(2)
        studio = RawStudio(
            external_id=f"paradisehill:{studio_id}",
            name=_decode_html(studio_name.strip()),
        )

    # Genres — only `itemprop="genre"` inside the movie's own block-inside
    # counts; recommended films carry their own `itemprop="genre"`. An earlier
    # regex required `</div></div><div class="similar"`, but the site sometimes
    # emits `</div></noindex>...<div class="similar"` (banner skin from
    # 2026-05-19), which made the block match fail and yielded 0 tags (bug
    # report `3c999b27` 2026-05-21, "No categories"). Robust approach: use the
    # first `<div class="similar...` as the stop boundary without requiring any
    # particular closing `</div>` sequence.
    block_open = re.search(
        r'<div\s+class="block-inside"[^>]*itemtype="http://schema\.org/Movie"[^>]*>',
        html,
    )
    if block_open is None:
        block = html[:8000]
    else:
        tail = html[block_open.end():]
        # Everything before the boundary is the movie's own content; the rest
        # is recommendations and comments.
        boundary = re.search(r'<div\s+class="similar', tail)
        block = tail[:12000] if boundary is None else tail[: boundary.start()]

    # The site mixes two templates per page:
    #   v1: `itemprop="genre">Female Domination</span>`
    #   v2: `itemprop="genre"><a href="/category/...">All Sex</a></span>` (since 2026-05)
    # The optional `<a>` wrapper in the pattern covers v2.
    genre_names = [
        _decode_html(genre_match.group(1).strip())
        for genre_match in re.finditer(
            r'itemprop="genre"[^>]*>\s*(?:<a[^>]*>)?\s*([^<]+)', block, re.IGNORECASE,
        )
    ]
    # Keep at most the first 10 non-empty genre names.
    kept_names = [genre for genre in genre_names if genre][:10]
    tags: list[RawTag] = [RawTag(name=genre, slug=_slugify(genre)) for genre in kept_names]

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(chapter_match.group(1)),
            title=_decode_html(chapter_match.group(2).strip()),
        )
        for chapter_match in _CHAPTER_RE.finditer(html)
    ]

    page_url = f"{BASE_URL}/{hex_id}/"
    page_source = RawPlaybackSource(
        origin=SOURCE_NAME,
        page_url=page_url,
        thumbnail_url=poster_url,
    )

    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        performers=[],  # Paradisehill rarely has cast links — filled in via mirrors.
        tags=tags,
        chapters=chapters,
        playback_sources=[page_source],
        raw={"hex_id": hex_id, "html_len": len(html)},
    )
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&",
"&lt;": "<",
"&gt;": ">",
"&quot;": '"',
"&#39;": "'",
"&apos;": "'",
"&nbsp;": " ",
"&rsquo;": "'",
"&lsquo;": "'",
"&rdquo;": '"',
"&ldquo;": '"',
"&hellip;": "...",
"&mdash;": "",
"&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
# Numeric entities
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s
_SLUG_RE = re.compile(r"[^a-z0-9]+")
def _slugify(s: str) -> str:
return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"