goon/app/connectors/dooplay.py
jtrzupek 6ee0516e62 fix(connectors/dooplay): max_pages cap to unblock movie ingest queue
Bug-report 2026-05-28 ("od wczoraj nie ma nowych filmow"). DooplayConnector
.fetch_movies mial `while True` po stronach bez bound; streamporn (>2k filmow)
wisial godzinami az do dailowego killa schedulera, blokujac kolejke mangoporn
+ pandamovies. Watermark zamrozony, dziennie 0 nowych filmow.

Fix: cap _MAX_PAGES_DELTA=3 (since-driven runs, ~144 najnowszych pozycji)
i _MAX_PAGES_FULL=50 (full backfill gdy since=None). Wczesniejsza proba
filtrowania przez release_date odrzucona - release_date to data wydania filmu
(np. 2013), nie data uploadu na strone, wiec sortowanie listing nie matchuje.

Po deployu manualne re-run: streamporn 144/46s, pandamovies 120/47s,
mangoporn 108 z 72 NEW filmow w 58s. Scheduler queue unblocked.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 23:23:50 +02:00

483 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic dla mangoporn/streamporn/pandamovies.
Te 3 strony to dokładnie ten sam template (theme=dooplay + PsyPlay player plugin),
więc parametryzujemy connector po `(base_url, source_name)` i odpalamy 3 instancje.
Listing: `/movies/page/N/` zwraca <a href="/movies/<slug>/"> per item.
Detail: `/movies/<slug>/` ma rich meta:
- <h1> tytuł (w class="data" wrapper)
- <a href="/year/YYYY/" rel="tag"> rok produkcji
- <a href="/studios/<slug>/" rel="tag"> studio
- <span class='duration'>NN mins.</span> długość
- <a href="/pornstar/<slug>/"> cast (multi)
- <a href="/genre/<slug>/"> tagi (multi)
- <div itemprop="description"><p>...</p></div> opis
- <span class="dt_rating_vgs" itemprop="ratingValue">N</span> rating 0-10
- <li ... data-fl-source="<embed_url>"><a href="<embed_link>">Host</a></li> player options
Player ma multi-host options (DoodStream, LuluStream, RPMShare etc.) — każdy embed
URL idzie jako osobny `playback_source` z origin=`{site}:{host}` żeby później mobile
mógł wybrać czyim embedem chce odpalić scenę.
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.extractors import browser_get
from app.models.source import SourceKind
log = logging.getLogger(__name__)
# Desktop-Chrome User-Agent header value sent by `DooplayConnector._fetch`
# with every listing/detail request.
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# ---- selectors (theme-agnostic — work for any dooplay site) -----------
# Listing item — two patterns depending on the site:
# 1. mangoporn: a plain <a href="https://site/movies/<slug>/"> without a class
# (the theme renders the SEO-friendly URL directly in the grid)
# 2. streamporn/pandamovies: <a class="ml-mask jt" href="<base>/<slug>/">
# (slug without the /movies/ prefix, e.g. /watch-xxx-...-adult-movie-online-free/)
# Both are caught by one alternation: group `url` is the ml-mask anchor (href
# must come directly after `<a` and before the class attribute), group `url2`
# is any absolute link ending in /movies/<slug>/.
_LIST_ITEM_RE = re.compile(
r'<a\s+href="(?P<url>https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"'
r"|"
r'<a\s+href="(?P<url2>https?://[^"]+/movies/[a-z0-9-]+/)"',
re.IGNORECASE,
)
# Tolerant title match — mangoporn (dooplay) uses an <h1> inside the class="data"
# wrapper, streamporn/pandamovies (raw PsyPlay theme) use <h3 itemprop="name">.
# Group 1 is the text of a heading carrying itemprop="name"; group 2 is the text
# of the first heading right after a class="data" opening tag.
_TITLE_RE = re.compile(
r'<h[1-6][^>]*\sitemprop="name"[^>]*>([^<]+)</h[1-6]>'
r'|class="data"[^>]*>\s*<h[1-6][^>]*>([^<]+)</h[1-6]>',
re.IGNORECASE | re.DOTALL,
)
# dooplay uses /year/, raw PsyPlay uses /release-year/. The same holds for the
# other slugs — different themes inherit the base markup but customise their
# URL vocabularies. Group 1 is the four-digit year.
_YEAR_RE = re.compile(
r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"', re.IGNORECASE
)
# Studio link: group 1 is the URL slug, group 2 the link text. Accepts the
# /studio/, /studios/ and /director/ path prefixes.
_STUDIO_RE = re.compile(
r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
re.IGNORECASE,
)
# Duration: span class='duration' (dooplay) or <p><strong>Duration:</strong> X hrs. Y mins.</p> (PsyPlay)
_DURATION_SPAN_RE = re.compile(
r"<span\s+class=['\"]duration['\"][^>]*>([^<]+)</span>", re.IGNORECASE
)
_DURATION_TEXT_RE = re.compile(
r"<strong>\s*Duration:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Release date: span class='release_date' (dooplay) or <p><strong>Released Date:</strong> X</p> (PsyPlay).
# The span pattern tolerates one stray `'` after the closing quote of the class value.
_RELEASE_DATE_SPAN_RE = re.compile(
r"<span\s+class=['\"]release_date['\"]'?[^>]*>([^<]+)</span>", re.IGNORECASE
)
_RELEASE_DATE_TEXT_RE = re.compile(
r"<strong>\s*Released?\s*Date:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Description: everything between the element carrying itemprop="description"
# and the first following </div> (non-greedy, so a nested <div> ends it early).
_DESCRIPTION_RE = re.compile(
r'itemprop="description"[^>]*>(.*?)</div>', re.IGNORECASE | re.DOTALL
)
# Rating: numeric text of the itemprop="ratingValue" span (digits and dots only).
_RATING_RE = re.compile(
r'itemprop="ratingValue"[^>]*>([\d.]+)</span>', re.IGNORECASE
)
# Cast: dooplay /pornstar/, PsyPlay /actor/. Group 1 = slug, group 2 = link text.
_PORNSTAR_RE = re.compile(
r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
re.IGNORECASE,
)
# Genre: the same /genre(s)/ path in both themes. Group 1 = slug, group 2 = link text.
_GENRE_TAG_RE = re.compile(
r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
re.IGNORECASE,
)
# Player options: data-fl-source jest oryginalnym embed URL hostera, data-fl-url
# to page URL u hostera. Stare theme (mangoporn): `<li class="hosts-buttons-wpx">`.
# Nowe theme (pandamovies od ~2026-04): `<div class="Rtable1-cell" data-fl-url=...
# data-fl-source=...>`. Trzeba też tolerować order-independent attrs — nowe theme
# emituje url BEFORE source, stare odwrotnie. Łapiemy oba wzorce dwoma osobnymi
# regexami i konsolidujemy w `_iter_player_options`.
_PLAYER_OPTION_RE = re.compile(
r'<li[^>]*\bclass="hosts-buttons-wpx"[^>]*'
r'(?:data-fl-source="(?P<source>[^"]*)"[^>]*)?'
r'(?:data-fl-url="(?P<page>[^"]*)"[^>]*)?'
r'>\s*<a[^>]*href="(?P<href>[^"]+)"[^>]*'
r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
re.IGNORECASE | re.DOTALL,
)
# New pandamovies markup: `<div class="Rtable1-cell" data-fl-* ...><a href=...>HostName</a></div>`.
# Attributes come in url→source order and the source is often empty
# (`data-fl-source=""` for doodstream/mixdrop/easyvidplayer). The WHOLE opening
# tag is captured in group(1) so that data-fl-source is guaranteed to belong to
# THIS particular div (an earlier 600-char window lookback could pick up the
# previous cell — cross-attribution of a doodstream→mixdrop entry, code-review #14).
_PLAYER_OPTION_DIV_RE = re.compile(
r'(<div[^>]*\bclass="Rtable1-cell"[^>]*>)\s*'
r'<a[^>]*href="(?P<href>[^"]+)"[^>]*'
r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
re.IGNORECASE | re.DOTALL,
)
# Pulls the (possibly empty) data-fl-source attribute value out of a single opening tag.
_DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE)
# Poster — the JSON-LD `thumbnailUrl` is the most stable source (every
# dooplay/PsyPlay theme with SEO has a JSON-LD VideoObject schema). Fallback:
# the class="poster" img for old installs without the schema. Third fallback:
# the og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE
)
_POSTER_RE = re.compile(
r'class="poster"[^>]*>\s*<img\s+[^>]*src="([^"]+)"', re.IGNORECASE
)
_POSTER_OG_RE = re.compile(
r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
# Number of minutes in free text such as "32 mins." (group 1 = the digits).
_DURATION_MINS_RE = re.compile(r"(\d+)\s*min", re.IGNORECASE)
class DooplayConnector(BaseMovieConnector):
    """Generic dooplay/PsyPlay scraper; concrete sites are thin subclasses.

    Subclasses must define the class-level ``base_url`` (site root) and
    ``name`` (source identifier used in log lines and external ids).
    """

    kind = SourceKind.scraper
    base_url: str
    name: str

    def __init__(self, *, timeout: float = 30.0):
        """Validate the subclass configuration and remember the request timeout.

        Args:
            timeout: Value passed as ``timeout`` to every ``browser_get`` call.

        Raises:
            RuntimeError: If the class defines no ``base_url`` or no ``name``.
        """
        if not getattr(self, "base_url", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `base_url`")
        if not getattr(self, "name", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `name`")
        self._timeout = timeout

    def close(self) -> None:
        """No-op: ``__init__`` stores only the timeout, so there is nothing to release."""

    def _fetch(self, url: str) -> str:
        """Download ``url`` (absolute, or a path relative to ``base_url``) as text.

        Uses ``browser_get`` rather than plain httpx: per the original author's
        note, psyplay sites sometimes answer a bare httpx client (Python TLS
        fingerprint) with 500/403 and browser impersonation works around that.

        Raises:
            httpx.HTTPStatusError: If the response status code is >= 400.
        """
        site_root = self.base_url.rstrip("/")
        if not url.startswith("http"):
            url = site_root + url
        headers = {
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            # Built from the slash-normalised root so a trailing slash in
            # `base_url` cannot yield a "//" Referer.
            "Referer": site_root + "/",
        }
        r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True)
        if r.status_code >= 400:
            # browser_get's response is wrapped into an httpx error so callers
            # can handle every failure with a single `except httpx.HTTPError`.
            raise httpx.HTTPStatusError(
                f"{r.status_code} for {url}",
                request=None,  # type: ignore[arg-type]
                response=httpx.Response(r.status_code, text=r.text[:200]),
            )
        return r.text

    # Safety cap — a dooplay listing can have thousands of pages (streamporn.nl
    # has >2k movies). Without it the ingest hangs for hours, gets killed on
    # scheduler restart and blocks the next connectors in the queue (bug-report
    # 2026-05-28: "no new movies since yesterday" — streamporn had been hanging
    # since 5-24, blocking mangoporn + pandamovies). The listing is sorted by
    # upload date (NOT the movie's release_date — the release may be from 2013
    # and the upload from today), but the upload date is not in the markup, so
    # filtering by `since` via release_date does not work. Pragmatic page cap:
    # 3 for delta runs (≈150 new items/day is far above the real upload rate),
    # 50 for a full ingest (`since=None`).
    _MAX_PAGES_DELTA = 3
    _MAX_PAGES_FULL = 50

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Yield movies from the newest listing pages, one detail fetch per item.

        Args:
            since: Only its presence matters — a non-``None`` value selects the
                small delta page cap, ``None`` selects the full-backfill cap.
                It is not used to filter individual movies.
            limit: Maximum number of movies to yield; ``None`` means no limit.
                A value <= 0 yields nothing and performs no requests.

        Yields:
            One ``RawMovie`` per successfully fetched and parsed detail page.
        """
        # The limit used to be checked only after a yield, so limit=0 still
        # fetched a listing page and produced one movie.
        if limit is not None and limit <= 0:
            return

        yielded = 0
        visited: set[str] = set()
        max_pages = self._MAX_PAGES_DELTA if since is not None else self._MAX_PAGES_FULL

        # NOTE(review): only httpx.HTTPError is handled below; if browser_get
        # raises its own exception type on transport failures it will
        # propagate out of this generator — confirm against app.extractors.
        for page in range(1, max_pages + 1):
            try:
                # `_fetch_listing` is a generator: materialise it inside the
                # `try` so that fetch errors are raised (and caught) here.
                urls = list(self._fetch_listing(page))
            except httpx.HTTPError as e:
                log.warning("%s listing page=%d failed: %s", self.name, page, e)
                return
            if not urls:
                log.info("%s: empty page=%d, stop", self.name, page)
                return

            for url in urls:
                if url in visited:
                    continue
                visited.add(url)
                try:
                    movie = self._fetch_detail(url)
                except httpx.HTTPError as e:
                    # One broken detail page must not abort the whole run.
                    log.warning("%s detail %s failed: %s", self.name, url, e)
                    continue
                if movie is None:
                    continue
                yield movie
                yielded += 1
                if limit is not None and yielded >= limit:
                    return

        # Reached only when every allowed page was processed without an early return.
        log.info(
            "%s: hit max_pages=%d cap (delta=%s), stopping after seen=%d",
            self.name, max_pages, since is not None, yielded,
        )

    def _fetch_listing(self, page: int) -> Iterator[str]:
        """Yield detail-page URLs found on listing page ``page``.

        Only links whose hostname equals the hostname of ``base_url`` are
        yielded; duplicates are not filtered here.
        """
        from urllib.parse import urlparse

        text = self._fetch(self._listing_path(page))
        site_host = urlparse(self.base_url).hostname
        for m in _LIST_ITEM_RE.finditer(text):
            url = m.group("url") or m.group("url2")
            if not url:
                continue
            try:
                link_host = urlparse(url).hostname
            except ValueError:
                # Malformed URL in the markup (e.g. a broken IPv6 literal) — skip it.
                continue
            if link_host != site_host:
                continue
            yield url

    def _listing_path(self, page: int) -> str:
        """Return the site-relative path of listing page ``page`` (1-based)."""
        return "/movies/" if page == 1 else f"/movies/page/{page}/"

    def _fetch_detail(self, url: str) -> RawMovie | None:
        """Fetch the detail page at ``url`` and parse it.

        The slug is the last path segment of ``url`` (``"root"`` when the path
        is empty). Returns whatever ``_parse_dooplay_detail`` returns.
        """
        from urllib.parse import urlparse

        path = urlparse(url).path.rstrip("/")
        slug = path.split("/")[-1] or "root"
        text = self._fetch(url)
        return _parse_dooplay_detail(
            slug=slug, page_url=url, html=text,
            source_name=self.name, base_url=self.base_url,
        )
def _parse_dooplay_detail(
    *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
) -> RawMovie | None:
    """Parse one dooplay/PsyPlay movie detail page into a ``RawMovie``.

    Args:
        slug: Identifier of the movie; becomes ``external_id`` and is used in
            the fallback ``page_url``.
        html: Raw HTML of the detail page.
        source_name: Site identifier (e.g. ``"mangoporn"``); prefixes the
            external ids of studio/performers/tags and the playback origins.
        base_url: Site root; used only to build ``page_url`` when it is not given.
        page_url: URL of the detail page, if known.

    Returns:
        The parsed movie, or ``None`` (with a warning logged) when no title
        can be found in ``html``.
    """
    from urllib.parse import urlparse

    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("%s: no title in %s", source_name, slug)
        return None
    title = _decode_html((m_title.group(1) or m_title.group(2)).strip())

    m_year = _YEAR_RE.search(html)
    release_year = int(m_year.group(1)) if m_year else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_RE.search(html)
    if m_studio:
        studio_slug = m_studio.group(1)
        studio = RawStudio(
            external_id=f"{source_name}:{studio_slug}",
            name=_decode_html(m_studio.group(2).strip()),
            slug=studio_slug,
        )

    duration_sec: int | None = None
    m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
    if m_dur:
        duration_text = m_dur.group(1)
        # Can be "32 mins." (dooplay) or "1 hrs. 12 mins." (PsyPlay).
        m_h = re.search(r"(\d+)\s*hr", duration_text, re.IGNORECASE)
        m_m = _DURATION_MINS_RE.search(duration_text)
        if m_h or m_m:
            hours = int(m_h.group(1)) if m_h else 0
            minutes = int(m_m.group(1)) if m_m else 0
            duration_sec = hours * 3600 + minutes * 60

    release_date: date | None = None
    m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
    if m_rd:
        date_text = m_rd.group(1).strip()
        # First format that parses wins; unparseable text leaves release_date None.
        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"):
            try:
                release_date = datetime.strptime(date_text, fmt).date()
                break
            except ValueError:
                continue

    description: str | None = None
    m_desc = _DESCRIPTION_RE.search(html)
    if m_desc:
        description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None

    rating: float | None = None
    m_rating = _RATING_RE.search(html)
    if m_rating:
        try:
            rating = float(m_rating.group(1))
        except ValueError:
            # The pattern admits strings such as "1.2.3"; treat them as "no rating".
            pass

    poster_url: str | None = None
    for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
        m = rgx.search(html)
        if m:
            candidate = m.group(1).strip()
            # Placeholder images do not count; fall through to the next pattern.
            if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
                poster_url = candidate
                break

    # Performers — only the "Pornstars" section carries /pornstar/<slug>/ links,
    # dooplay filters the cast into that section. Duplicates may slip through
    # here; de-duplication happens in the resolver (by performer_id).
    performers = [
        RawPerformer(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
        )
        for m in _PORNSTAR_RE.finditer(html)
    ]
    tags = [
        RawTag(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
            slug=m.group(1),
        )
        for m in _GENRE_TAG_RE.finditer(html)
    ]

    if page_url is None:
        # rstrip so a base_url with a trailing slash cannot produce "//movies/".
        page_url = f"{base_url.rstrip('/')}/movies/{slug}/"

    # Playback sources: every host (Doodstream/Lulu/RPM/...) is a separate entry,
    # de-duplicated by href so the same host is not added twice. The raw landing
    # page (origin=source_name, without :host) is appended ONLY when there are no
    # sub-hosters at all — otherwise it misleads the user (opens a WebView full
    # of ads instead of the video; bug-report 2026-05-16: "mangoporn redirects to
    # the site, full-screen ad").
    playback_sources: list[RawPlaybackSource] = []
    seen_hrefs: set[str] = set()
    # File-download (non-streamable) hosters + malware. The mobile player cannot
    # play them — rapidgator/nitroflare/frdl serve .zip/.rar/.mp4 downloads
    # (premium login required), streamtape has a drive-by .reg malware. They are
    # skipped at ingest so the UI is not littered with dead content
    # (bug-report 2026-05-18).
    SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"}

    def _emit_host_entry(href: str, source: str | None) -> None:
        """Append one playback source for ``href`` unless it is a duplicate or a skipped hoster."""
        href = href.strip()
        if not href or href in seen_hrefs:
            return
        seen_hrefs.add(href)
        try:
            host = urlparse(href).hostname or "unknown"
            # Second-to-last label of the hostname, e.g. "dood" for "www.dood.li".
            host_short = host.split(".")[-2] if host.count(".") >= 1 else host
        except ValueError:
            # urlparse rejects some malformed URLs; keep the entry under a generic origin.
            host_short = "unknown"
        if host_short.lower() in SKIP_HOSTERS:
            return
        playback_sources.append(
            RawPlaybackSource(
                origin=f"{source_name}:{host_short}",
                page_url=href,
                embed_url=source or href,
                thumbnail_url=poster_url,
                duration_sec=duration_sec,
            )
        )

    def _source_from_tag(opening_tag: str) -> str | None:
        """Return the non-empty data-fl-source value of ``opening_tag``, else None."""
        src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
        return (src_match.group(1).strip() if src_match else "") or None

    # Old `<li class="hosts-buttons-wpx">` markup (mangoporn). The `source`
    # group is not relied upon alone: when it is empty or did not participate,
    # data-fl-source is read from this <li>'s own opening tag (the match text up
    # to its first ">"), exactly like the div branch below. Previously the
    # embed URL silently degraded to the page href for this markup.
    for m in _PLAYER_OPTION_RE.finditer(html):
        source = (m.group("source") or "").strip() or None
        if source is None:
            source = _source_from_tag(m.group(0).partition(">")[0])
        _emit_host_entry(m.group("href") or "", source)

    # New `<div class="Rtable1-cell">` markup (pandamovies since ~2026-04 + new
    # streamporn instances). data-fl-source is optional — the WHOLE opening tag
    # is in group(1) and data-fl-source is extracted from THAT tag (not from a
    # lookback window over the HTML, which could pick up the previous cell).
    for m in _PLAYER_OPTION_DIV_RE.finditer(html):
        _emit_host_entry(m.group("href") or "", _source_from_tag(m.group(1)))

    if not playback_sources:
        # No sub-hosters found — fall back to the landing page (mobile opens it
        # in a WebView). Done ONLY when there is no alternative; otherwise the
        # landing page is just a needless ad page.
        playback_sources.append(
            RawPlaybackSource(
                origin=source_name,
                page_url=page_url,
                thumbnail_url=poster_url,
            )
        )

    return RawMovie(
        external_id=slug,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        duration_sec=duration_sec,
        rating=rating,
        poster_url=poster_url,
        url=page_url,
        studio=studio,
        performers=performers,
        tags=tags,
        playback_sources=playback_sources,
        raw={"slug": slug, "html_len": len(html)},
    )
# ---- per-site instances ----------------------------------------------------
class StreampornConnector(DooplayConnector):
    """Dooplay connector bound to the streamporn site."""

    base_url = "https://streamporn.nl"
    name = "streamporn"
class PandamoviesConnector(DooplayConnector):
    """Dooplay connector bound to the pandamovies site."""

    base_url = "https://pandamovies.pw"
    name = "pandamovies"
class MangopornConnector(DooplayConnector):
    """Dooplay connector bound to the mangoporn site."""

    base_url = "https://mangoporn.net"
    name = "mangoporn"
# ---------------------------------------------------------------------------
# Helpers (zduplikowane z paradisehill.py — celowo, żeby connectory były niezależne)
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'",
"&apos;": "'", "&nbsp;": " ", "&rsquo;": "'", "&lsquo;": "'",
"&rdquo;": '"', "&ldquo;": '"', "&hellip;": "...", "&mdash;": "", "&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s