goon/app/connectors/dooplay.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

466 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic for mangoporn/streamporn/pandamovies.
These 3 sites use exactly the same template (theme=dooplay + PsyPlay player plugin),
so the connector is parameterized by `(base_url, source_name)` and run as 3 instances.
Listing: `/movies/page/N/` returns one <a href="/movies/<slug>/"> per item.
Detail: `/movies/<slug>/` has rich metadata:
- <h1> title (inside a class="data" wrapper)
- <a href="/year/YYYY/" rel="tag"> production year
- <a href="/studios/<slug>/" rel="tag"> studio
- <span class='duration'>NN mins.</span> duration
- <a href="/pornstar/<slug>/"> cast (multiple)
- <a href="/genre/<slug>/"> tags (multiple)
- <div itemprop="description"><p>...</p></div> description
- <span class="dt_rating_vgs" itemprop="ratingValue">N</span> rating 0-10
- <li ... data-fl-source="<embed_url>"><a href="<embed_link>">Host</a></li> player options
The player offers multi-host options (DoodStream, LuluStream, RPMShare etc.) — each embed
URL is emitted as a separate `playback_source` with origin=`{site}:{host}` so that the
mobile client can later choose whose embed it wants to use to play the scene.
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.extractors import browser_get
from app.models.source import SourceKind
log = logging.getLogger(__name__)
# Desktop Chrome User-Agent string sent with every request made by `_fetch`.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# ---- selectors (theme-agnostic — work for any dooplay site) -----------
# Listing item — two patterns depending on the site:
# 1. mangoporn: plain <a href="https://site/movies/<slug>/"> with no class
#    (the theme renders the SEO-friendly URL directly in the grid)
# 2. streamporn/pandamovies: <a class="ml-mask jt" href="<base>/<slug>/">
#    (slug without the /movies/ prefix, e.g. /watch-xxx-...-adult-movie-online-free/)
# Both are caught through the alternation below.
# NOTE(review): the first alternative only matches when `href` is the first
# attribute and `class="ml-mask..."` comes after it — confirm against live markup.
_LIST_ITEM_RE = re.compile(
    r'<a\s+href="(?P<url>https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"'
    r"|"
    r'<a\s+href="(?P<url2>https?://[^"]+/movies/[a-z0-9-]+/)"',
    re.IGNORECASE,
)
# Tolerant title — mangoporn (dooplay) uses an <h1> inside class="data", streamporn/pandamovies
# (raw PsyPlay theme) use <h3 itemprop="name">. Group 1 is the itemprop="name" heading,
# group 2 is the heading that directly follows a class="data" opening tag.
_TITLE_RE = re.compile(
    r'<h[1-6][^>]*\sitemprop="name"[^>]*>([^<]+)</h[1-6]>'
    r'|class="data"[^>]*>\s*<h[1-6][^>]*>([^<]+)</h[1-6]>',
    re.IGNORECASE | re.DOTALL,
)
# dooplay uses /year/, raw PsyPlay uses /release-year/. The same holds for the other
# slugs — different themes inherit the base markup but customize the URL vocabulary.
_YEAR_RE = re.compile(
    r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"', re.IGNORECASE
)
# Studio link: group 1 is the URL slug, group 2 is the visible link text.
_STUDIO_RE = re.compile(
    r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Duration: span class='duration' (dooplay) or <p><strong>Duration:</strong> X hrs. Y mins.</p> (PsyPlay)
_DURATION_SPAN_RE = re.compile(
    r"<span\s+class=['\"]duration['\"][^>]*>([^<]+)</span>", re.IGNORECASE
)
_DURATION_TEXT_RE = re.compile(
    r"<strong>\s*Duration:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Release date: span class='release_date' (dooplay) or <p><strong>Released Date:</strong> X</p> (PsyPlay)
_RELEASE_DATE_SPAN_RE = re.compile(
    r"<span\s+class=['\"]release_date['\"]'?[^>]*>([^<]+)</span>", re.IGNORECASE
)
_RELEASE_DATE_TEXT_RE = re.compile(
    r"<strong>\s*Released?\s*Date:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Description: everything between the itemprop="description" opening tag and the
# first following </div> (non-greedy), tags included.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description"[^>]*>(.*?)</div>', re.IGNORECASE | re.DOTALL
)
# Rating: digits/dots inside the element carrying itemprop="ratingValue".
_RATING_RE = re.compile(
    r'itemprop="ratingValue"[^>]*>([\d.]+)</span>', re.IGNORECASE
)
# Cast: dooplay /pornstar/, PsyPlay /actor/
_PORNSTAR_RE = re.compile(
    r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Genre: the same /genre(s)/ path in both themes
_GENRE_TAG_RE = re.compile(
    r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Player options: data-fl-source jest oryginalnym embed URL hostera, data-fl-url
# to page URL u hostera. Stare theme (mangoporn): `<li class="hosts-buttons-wpx">`.
# Nowe theme (pandamovies od ~2026-04): `<div class="Rtable1-cell" data-fl-url=...
# data-fl-source=...>`. Trzeba też tolerować order-independent attrs — nowe theme
# emituje url BEFORE source, stare odwrotnie. Łapiemy oba wzorce dwoma osobnymi
# regexami i konsolidujemy w `_iter_player_options`.
_PLAYER_OPTION_RE = re.compile(
r'<li[^>]*\bclass="hosts-buttons-wpx"[^>]*'
r'(?:data-fl-source="(?P<source>[^"]*)"[^>]*)?'
r'(?:data-fl-url="(?P<page>[^"]*)"[^>]*)?'
r'>\s*<a[^>]*href="(?P<href>[^"]+)"[^>]*'
r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
re.IGNORECASE | re.DOTALL,
)
# New pandamovies markup: `<div class="Rtable1-cell" data-fl-* ...><a href=...>HostName</a></div>`.
# Attributes come in url→source order and source is often empty (`data-fl-source=""`
# for doodstream/mixdrop/easyvidplayer). The WHOLE opening tag is captured in group(1)
# so that data-fl-source is guaranteed to belong to THIS specific div (an earlier
# 600-char window lookback could pick up the previous cell — cross-attribution
# doodstream→mixdrop entry, code-review #14).
_PLAYER_OPTION_DIV_RE = re.compile(
    r'(<div[^>]*\bclass="Rtable1-cell"[^>]*>)\s*'
    r'<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
# Extracts the (possibly empty) data-fl-source attribute value from a tag string.
_DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE)
# Poster — the JSON-LD `thumbnailUrl` is the most stable source (every dooplay/PsyPlay
# theme with SEO has a JSON-LD VideoObject schema). Fallback to the class="poster" img
# for old installations without the schema. Third fallback: the og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE
)
_POSTER_RE = re.compile(
    r'class="poster"[^>]*>\s*<img\s+[^>]*src="([^"]+)"', re.IGNORECASE
)
_POSTER_OG_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
# Minutes component of a duration string such as "32 mins." (group 1 = the number).
_DURATION_MINS_RE = re.compile(r"(\d+)\s*min", re.IGNORECASE)
class DooplayConnector(BaseMovieConnector):
    """Generic dooplay scraper. Instantiated per-site via subclasses.

    Subclasses must set the class-level attributes ``base_url`` (site root)
    and ``name`` (source name used in log messages and external ids).
    """

    kind = SourceKind.scraper
    base_url: str
    name: str

    def __init__(self, *, timeout: float = 30.0):
        """Store the per-request timeout.

        Raises:
            RuntimeError: if the concrete class lacks a non-empty ``base_url``
                or ``name``.
        """
        if not getattr(self, "base_url", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `base_url`")
        if not getattr(self, "name", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `name`")
        self._timeout = timeout

    def close(self) -> None:
        """No-op; this class stores nothing but the timeout value."""
        pass

    def _fetch(self, url: str) -> str:
        """Fetch *url* through ``browser_get`` and return the response text.

        A *url* that does not start with "http" is treated as a path and is
        appended to ``base_url``. Per the original author's note, ``browser_get``
        is used (chrome120 impersonation via curl_cffi) because psyplay sites
        sometimes block plain httpx (Python TLS fingerprint) with 500/403.

        Raises:
            httpx.HTTPStatusError: when the response status is >= 400.
        """
        # Normalise once so neither the joined URL nor the Referer ends up
        # with a double slash when `base_url` carries a trailing "/".
        site_root = self.base_url.rstrip("/")
        if not url.startswith("http"):
            url = site_root + url
        headers = {
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            "Referer": site_root + "/",
        }
        r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True)
        if r.status_code >= 400:
            # browser_get's response is re-wrapped in an httpx error so that
            # callers only need to catch httpx.HTTPError.
            raise httpx.HTTPStatusError(
                f"{r.status_code} for {url}",
                request=None,  # type: ignore[arg-type]
                response=httpx.Response(r.status_code, text=r.text[:200]),
            )
        return r.text

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Walk the listing pages and yield one ``RawMovie`` per detail page.

        Args:
            since: accepted for interface compatibility; not used here.
            limit: maximum number of movies to yield; ``None`` means no cap,
                a value <= 0 yields nothing.

        Iteration stops when a listing page fails to load, contains no items,
        or contains no URL that has not been seen already. Detail pages that
        fail to load or to parse are skipped.
        """
        if limit is not None and limit <= 0:
            return
        emitted = 0
        page = 1
        seen_urls: set[str] = set()
        while True:
            try:
                urls = list(self._fetch_listing(page))
            except httpx.HTTPError as e:
                log.warning("%s listing page=%d failed: %s", self.name, page, e)
                return
            if not urls:
                log.info("%s: empty page=%d, stop", self.name, page)
                return
            # De-duplicate within the page (order preserved) and against
            # everything already visited.
            fresh = [u for u in dict.fromkeys(urls) if u not in seen_urls]
            if not fresh:
                # A page past the end can still contain already-seen links
                # (the listing regex also matches any /movies/<slug>/ anchor);
                # without this stop the loop would request pages forever.
                log.info("%s: page=%d has no new items, stop", self.name, page)
                return
            for url in fresh:
                seen_urls.add(url)
                try:
                    movie = self._fetch_detail(url)
                except httpx.HTTPError as e:
                    log.warning("%s detail %s failed: %s", self.name, url, e)
                    continue
                if movie is None:
                    continue
                yield movie
                emitted += 1
                if limit is not None and emitted >= limit:
                    return
            page += 1

    def _fetch_listing(self, page: int) -> Iterator[str]:
        """Yield the detail-page URLs found on listing page *page*.

        Only URLs whose hostname equals the hostname of ``base_url`` are
        yielded; duplicates are not filtered here.
        """
        from urllib.parse import urlparse

        text = self._fetch(self._listing_path(page))
        site_host = urlparse(self.base_url).hostname
        for m in _LIST_ITEM_RE.finditer(text):
            url = m.group("url") or m.group("url2")
            if not url:
                continue
            try:
                link_host = urlparse(url).hostname
            except ValueError:
                # urlparse rejects malformed netlocs (e.g. broken IPv6 brackets).
                continue
            if link_host != site_host:
                continue
            yield url

    def _listing_path(self, page: int) -> str:
        """Return the site-relative path of listing page *page* (1-based)."""
        return "/movies/" if page == 1 else f"/movies/page/{page}/"

    def _fetch_detail(self, url: str) -> RawMovie | None:
        """Download detail page *url* and parse it into a ``RawMovie``.

        The slug is the last path segment of *url* ("root" when the path is
        empty). Returns ``None`` when the parser rejects the page.
        """
        from urllib.parse import urlparse

        path = urlparse(url).path.rstrip("/")
        slug = path.split("/")[-1] or "root"
        text = self._fetch(url)
        return _parse_dooplay_detail(
            slug=slug,
            page_url=url,
            html=text,
            source_name=self.name,
            base_url=self.base_url,
        )
def _parse_dooplay_detail(
    *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
) -> RawMovie | None:
    """Parse the HTML of a dooplay/PsyPlay detail page into a ``RawMovie``.

    Args:
        slug: last path segment of the detail URL; becomes ``external_id``.
        html: full HTML text of the detail page.
        source_name: site name, used as prefix of studio/performer/tag ids
            and of playback-source origins.
        base_url: site root, only used to build ``page_url`` when that is None.
        page_url: URL of the detail page; defaults to ``{base_url}/movies/{slug}/``.

    Returns:
        The parsed movie, or ``None`` when no (non-blank) title is found.
    """
    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("%s: no title in %s", source_name, slug)
        return None
    title = _decode_html((m_title.group(1) or m_title.group(2)).strip()).strip()
    if not title:
        # A heading that holds only whitespace / &nbsp; is as useless as none.
        log.warning("%s: no title in %s", source_name, slug)
        return None

    m_year = _YEAR_RE.search(html)
    release_year = int(m_year.group(1)) if m_year else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_RE.search(html)
    if m_studio:
        studio_slug = m_studio.group(1)
        studio = RawStudio(
            external_id=f"{source_name}:{studio_slug}",
            name=_decode_html(m_studio.group(2).strip()),
            slug=studio_slug,
        )

    duration_sec = _parse_duration_sec(html)
    release_date = _parse_release_date(html)

    description: str | None = None
    m_desc = _DESCRIPTION_RE.search(html)
    if m_desc:
        description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None

    rating: float | None = None
    m_rating = _RATING_RE.search(html)
    if m_rating:
        try:
            rating = float(m_rating.group(1))
        except ValueError:
            # `[\d.]+` also matches strings such as "1.2.3"; treat as no rating.
            pass

    poster_url = _select_poster_url(html)

    # Original author's note: only the "Pornstars" section carries
    # /pornstar/<slug>/ links, so the cast is already filtered by the theme.
    # Duplicates are possible; de-duplication happens in the resolver
    # (by performer_id).
    performers = [
        RawPerformer(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
        )
        for m in _PORNSTAR_RE.finditer(html)
    ]
    tags = [
        RawTag(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
            slug=m.group(1),
        )
        for m in _GENRE_TAG_RE.finditer(html)
    ]

    if page_url is None:
        page_url = f"{base_url}/movies/{slug}/"

    playback_sources = _collect_playback_sources(
        html,
        source_name=source_name,
        page_url=page_url,
        poster_url=poster_url,
        duration_sec=duration_sec,
    )

    return RawMovie(
        external_id=slug,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        duration_sec=duration_sec,
        rating=rating,
        poster_url=poster_url,
        url=page_url,
        studio=studio,
        performers=performers,
        tags=tags,
        playback_sources=playback_sources,
        raw={"slug": slug, "html_len": len(html)},
    )


# File-download (non-streamable) hosters + malware. Per the original author's
# note the mobile player cannot play them — rapidgator/nitroflare/frdl serve
# .zip/.rar/.mp4 downloads (premium login required), streamtape ships a
# drive-by .reg — so they are skipped at ingest (bug-report 2026-05-18).
_SKIP_HOSTERS = frozenset({"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"})
# Release-date formats tried in order: "January 5, 2024", "Jan 5, 2024", ISO.
_RELEASE_DATE_FORMATS = ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d")
# Hours component of a duration string such as "1 hrs. 12 mins.".
_DURATION_HOURS_RE = re.compile(r"(\d+)\s*hr", re.IGNORECASE)


def _parse_duration_sec(html: str) -> int | None:
    """Return the duration in seconds, or None when none can be parsed.

    Handles "32 mins." (dooplay) as well as "1 hrs. 12 mins." (PsyPlay).
    """
    m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
    if not m_dur:
        return None
    text = m_dur.group(1)
    m_hours = _DURATION_HOURS_RE.search(text)
    m_mins = _DURATION_MINS_RE.search(text)
    if not (m_hours or m_mins):
        return None
    hours = int(m_hours.group(1)) if m_hours else 0
    minutes = int(m_mins.group(1)) if m_mins else 0
    return hours * 3600 + minutes * 60


def _parse_release_date(html: str) -> date | None:
    """Return the release date, or None when absent or in an unknown format."""
    m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
    if not m_rd:
        return None
    text = m_rd.group(1).strip()
    for fmt in _RELEASE_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    return None


def _select_poster_url(html: str) -> str | None:
    """Return the first usable poster URL (JSON-LD, class="poster", og:image)."""
    for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
        m = rgx.search(html)
        if not m:
            continue
        candidate = m.group(1).strip()
        # Placeholder images are not real posters; try the next source.
        if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
            return candidate
    return None


def _host_label(href: str) -> str:
    """Return the second-to-last hostname label of *href* ("unknown" if none)."""
    from urllib.parse import urlparse

    try:
        host = urlparse(href).hostname or "unknown"
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. broken IPv6 brackets).
        return "unknown"
    return host.split(".")[-2] if "." in host else host


def _collect_playback_sources(
    html: str,
    *,
    source_name: str,
    page_url: str,
    poster_url: str | None,
    duration_sec: int | None,
) -> list[RawPlaybackSource]:
    """Build the playback sources of a detail page.

    Every host (Doodstream/Lulu/RPM/...) becomes its own entry, de-duplicated
    by href. The raw landing page (origin=source_name, without ``:host``) is
    appended ONLY when no hoster entry was produced — otherwise, per
    bug-report 2026-05-16, it misleads the user (opens a WebView with
    full-screen ads instead of the video).
    """
    # (href, opening tag of the element that owns the link). The embed URL is
    # always read from that opening tag so it can never be attributed to a
    # neighbouring cell, and so it does not depend on whether the `<li>`
    # regex managed to capture its optional `source` group.
    candidates: list[tuple[str, str]] = [
        # Old `<li class="hosts-buttons-wpx">` markup (mangoporn): the match
        # starts at "<li", so everything before the first ">" is its tag.
        (m.group("href") or "", m.group(0).partition(">")[0])
        for m in _PLAYER_OPTION_RE.finditer(html)
    ]
    candidates += [
        # New `<div class="Rtable1-cell">` markup (pandamovies since ~2026-04
        # and newer streamporn instances): group(1) is the whole opening tag.
        (m.group("href") or "", m.group(1))
        for m in _PLAYER_OPTION_DIV_RE.finditer(html)
    ]

    sources: list[RawPlaybackSource] = []
    seen_hrefs: set[str] = set()
    for href, opening_tag in candidates:
        href = href.strip()
        if not href or href in seen_hrefs:
            continue
        seen_hrefs.add(href)
        host_short = _host_label(href)
        if host_short.lower() in _SKIP_HOSTERS:
            continue
        src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
        # data-fl-source is optional and often empty; fall back to the href.
        embed = (src_match.group(1).strip() if src_match else "") or None
        sources.append(
            RawPlaybackSource(
                origin=f"{source_name}:{host_short}",
                page_url=href,
                embed_url=embed or href,
                thumbnail_url=poster_url,
                duration_sec=duration_sec,
            )
        )

    if not sources:
        # No hoster entries — fall back to the landing page (the mobile client
        # opens it in a WebView).
        sources.append(
            RawPlaybackSource(
                origin=source_name,
                page_url=page_url,
                thumbnail_url=poster_url,
            )
        )
    return sources
# ---- per-site instances ----------------------------------------------------
class StreampornConnector(DooplayConnector):
    """Dooplay connector bound to https://streamporn.nl (source name "streamporn")."""

    base_url = "https://streamporn.nl"
    name = "streamporn"
class PandamoviesConnector(DooplayConnector):
    """Dooplay connector bound to https://pandamovies.pw (source name "pandamovies")."""

    base_url = "https://pandamovies.pw"
    name = "pandamovies"
class MangopornConnector(DooplayConnector):
    """Dooplay connector bound to https://mangoporn.net (source name "mangoporn")."""

    base_url = "https://mangoporn.net"
    name = "mangoporn"
# ---------------------------------------------------------------------------
# Helpers (duplicated from paradisehill.py — on purpose, so that the connectors stay independent)
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'",
"&apos;": "'", "&nbsp;": " ", "&rsquo;": "'", "&lsquo;": "'",
"&rdquo;": '"', "&ldquo;": '"', "&hellip;": "...", "&mdash;": "", "&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s