"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic dla mangoporn/streamporn/pandamovies. Te 3 strony to dokładnie ten sam template (theme=dooplay + PsyPlay player plugin), więc parametryzujemy connector po `(base_url, source_name)` i odpalamy 3 instancje. Listing: `/movies/page/N/` zwraca per item. Detail: `/movies//` ma rich meta: -

tytuł (w class="data" wrapper) - cast (multi) - tagi (multi) -

...

opis - N rating 0-10 -
  • Host
  • player options Player ma multi-host options (DoodStream, LuluStream, RPMShare etc.) — każdy embed URL idzie jako osobny `playback_source` z origin=`{site}:{host}` żeby później mobile mógł wybrać czyim embedem chce odpalić scenę. """ from __future__ import annotations import logging import re from collections.abc import Iterator from datetime import date, datetime from typing import Any import httpx from app.connectors.base import ( BaseMovieConnector, RawMovie, RawPerformer, RawPlaybackSource, RawStudio, RawTag, ) from app.extractors import browser_get from app.models.source import SourceKind log = logging.getLogger(__name__) USER_AGENT = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" ) # ---- selektory (theme-agnostic — działa dla dowolnego dooplay) ----------- # Listing item — dwa wzorce w zależności od witryny: # 1. mangoporn: zwykłe bez klasy # (theme wyrendurował SEO-friendly URL bezpośrednio w grid) # 2. streamporn/pandamovies: # (slug bez /movies/ prefix, np. /watch-xxx-...-adult-movie-online-free/) # Łapiemy oba przez alternatywę. _LIST_ITEM_RE = re.compile( r'https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"' r"|" r'https?://[^"]+/movies/[a-z0-9-]+/)"', re.IGNORECASE, ) # Tolerantny title — mangoporn (dooplay) używa

    w class="data", streamporn/pandamovies # (raw PsyPlay theme) używają

    . Łapiemy oba przez itemprop="name". _TITLE_RE = re.compile( r']*\sitemprop="name"[^>]*>([^<]+)' r'|class="data"[^>]*>\s*]*>([^<]+)', re.IGNORECASE | re.DOTALL, ) # dooplay uses /year/, raw PsyPlay uses /release-year/. Same dla pozostałych slugów — # różne thema dziedziczą podstawowy markup ale customizują URL słowniki. _YEAR_RE = re.compile( r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"', re.IGNORECASE ) _STUDIO_RE = re.compile( r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)', re.IGNORECASE, ) # Duration: span class='duration' (dooplay) lub

    Duration: X hrs. Y mins.

    (PsyPlay) _DURATION_SPAN_RE = re.compile( r"]*>([^<]+)", re.IGNORECASE ) _DURATION_TEXT_RE = re.compile( r"\s*Duration:\s*\s*([^<]+)<", re.IGNORECASE ) # Release date: span class='release_date' (dooplay) lub

    Released Date: X

    (PsyPlay) _RELEASE_DATE_SPAN_RE = re.compile( r"]*>([^<]+)", re.IGNORECASE ) _RELEASE_DATE_TEXT_RE = re.compile( r"\s*Released?\s*Date:\s*\s*([^<]+)<", re.IGNORECASE ) _DESCRIPTION_RE = re.compile( r'itemprop="description"[^>]*>(.*?)', re.IGNORECASE | re.DOTALL ) _RATING_RE = re.compile( r'itemprop="ratingValue"[^>]*>([\d.]+)', re.IGNORECASE ) # Cast: dooplay /pornstar/, PsyPlay /actor/ _PORNSTAR_RE = re.compile( r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)', re.IGNORECASE, ) # Genre: same /genre(s)/ w obu themach _GENRE_TAG_RE = re.compile( r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)', re.IGNORECASE, ) # Player options: data-fl-source jest oryginalnym embed URL hostera, data-fl-url # to page URL u hostera. Stare theme (mangoporn): `
  • `. # Nowe theme (pandamovies od ~2026-04): `
    `. Trzeba też tolerować order-independent attrs — nowe theme # emituje url BEFORE source, stare odwrotnie. Łapiemy oba wzorce dwoma osobnymi # regexami i konsolidujemy w `_iter_player_options`. _PLAYER_OPTION_RE = re.compile( r']*\bclass="hosts-buttons-wpx"[^>]*' r'(?:data-fl-source="(?P[^"]*)"[^>]*)?' r'(?:data-fl-url="(?P[^"]*)"[^>]*)?' r'>\s*]*href="(?P[^"]+)"[^>]*' r'(?:[^<]*]+>)?\s*([^<]+?)\s*', re.IGNORECASE | re.DOTALL, ) # Nowy markup pandamovies: ``. # Attrs są w kolejności url→source, source często pusty (`data-fl-source=""` dla # doodstream/mixdrop/easyvidplayer). Capturujemy CAŁY opening tag w group(1) # żeby data-fl-source należał gwarantowanie do TEGO konkretnego div (wcześniejszy # window-lookback 600 chars mógł pickować poprzedni cell — cross-attribution # doodstream→mixdrop entry, code-review #14). _PLAYER_OPTION_DIV_RE = re.compile( r'(]*\bclass="Rtable1-cell"[^>]*>)\s*' r']*href="(?P[^"]+)"[^>]*' r'(?:[^<]*]+>)?\s*([^<]+?)\s*', re.IGNORECASE | re.DOTALL, ) _DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE) # Poster — JSON-LD `thumbnailUrl` jest najbardziej stabilny (każdy dooplay/PsyPlay # theme z SEO ma JSON-LD VideoObject schema). Fallback na class="poster" img dla starych # instalacji bez schema. Trzeci fallback: og:image meta tag. _POSTER_JSONLD_RE = re.compile( r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE ) _POSTER_RE = re.compile( r'class="poster"[^>]*>\s*]*src="([^"]+)"', re.IGNORECASE ) _POSTER_OG_RE = re.compile( r' None: pass def _fetch(self, url: str) -> str: """browser_get z chrome120 impersonation — psyplay sites czasem blokują czysty httpx (Python TLS fingerprint) zwracając 500/403. curl_cffi fixuje to.""" if not url.startswith("http"): url = self.base_url.rstrip("/") + url headers = { "User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml", "Referer": self.base_url + "/", } r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True) if r.status_code >= 400: raise httpx.HTTPStatusError( f"{r.status_code} for {url}", request=None, # type: ignore[arg-type] response=httpx.Response(r.status_code, text=r.text[:200]), ) return r.text # Bezpiecznik — dooplay listing potrafi mieć tysiące stron (streamporn.nl ma # >2k filmów). Bez tego ingest wisi godzinami, jest killowany przy restartcie # schedulera, blokując kolejne connectory w queue (bug-report 2026-05-28: "od # wczoraj nie ma nowych filmów" — streamporn wisiał od 5-24, blokował # mangoporn + pandamovies). Listing jest sortowany po dacie uploadu (NIE # release_date filmu — release może być z 2013 a upload z dziś), ale upload # date nie jest w markupie, więc filtrowanie po `since` przez release_date # nie działa. Pragmatyczny cap stron: 3 dla delta (≈150 nowych pozycji/dzień # to znacznie powyżej realnego upload-rate), 50 dla full ingestu (`since=None`). _MAX_PAGES_DELTA = 3 _MAX_PAGES_FULL = 50 def fetch_movies( self, *, since: datetime | None = None, limit: int | None = None, ) -> Iterator[RawMovie]: seen = 0 page = 1 seen_urls: set[str] = set() max_pages = self._MAX_PAGES_DELTA if since is not None else self._MAX_PAGES_FULL while page <= max_pages: try: urls = list(self._fetch_listing(page)) except httpx.HTTPError as e: log.warning("%s listing page=%d failed: %s", self.name, page, e) return if not urls: log.info("%s: empty page=%d, stop", self.name, page) return for url in urls: if url in seen_urls: continue seen_urls.add(url) try: movie = self._fetch_detail(url) except httpx.HTTPError as e: log.warning("%s detail %s failed: %s", self.name, url, e) continue if movie is None: continue yield movie seen += 1 if limit is not None and seen >= limit: return page += 1 log.info( "%s: hit max_pages=%d cap (delta=%s), stopping after seen=%d", self.name, max_pages, since is not None, seen, ) def _fetch_listing(self, page: int) -> Iterator[str]: path = self._listing_path(page) text = self._fetch(path) from urllib.parse import urlparse site_host = urlparse(self.base_url).hostname for m in _LIST_ITEM_RE.finditer(text): url = m.group("url") or m.group("url2") if not url: continue try: if urlparse(url).hostname != site_host: continue except Exception: continue yield url def _listing_path(self, page: int) -> str: return "/movies/" if page == 1 else f"/movies/page/{page}/" def _fetch_detail(self, url: str) -> RawMovie | None: from urllib.parse import urlparse path = urlparse(url).path.rstrip("/") slug = path.split("/")[-1] or "root" text = self._fetch(url) return _parse_dooplay_detail( slug=slug, page_url=url, html=text, source_name=self.name, base_url=self.base_url, ) def _parse_dooplay_detail( *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None ) -> RawMovie | None: m_title = _TITLE_RE.search(html) if not m_title: log.warning("%s: no title in %s", source_name, slug) return None title = _decode_html((m_title.group(1) or m_title.group(2)).strip()) m_year = _YEAR_RE.search(html) release_year = int(m_year.group(1)) if m_year else None studio: RawStudio | None = None m_studio = _STUDIO_RE.search(html) if m_studio: studio_slug = m_studio.group(1) studio_name = _decode_html(m_studio.group(2).strip()) studio = RawStudio( external_id=f"{source_name}:{studio_slug}", name=studio_name, slug=studio_slug, ) duration_sec: int | None = None m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html) if m_dur: text = m_dur.group(1) # Może być "32 mins." (dooplay) albo "1 hrs. 12 mins." (PsyPlay) m_h = re.search(r"(\d+)\s*hr", text, re.IGNORECASE) m_m = re.search(r"(\d+)\s*min", text, re.IGNORECASE) if m_h or m_m: duration_sec = (int(m_h.group(1)) * 3600 if m_h else 0) + (int(m_m.group(1)) * 60 if m_m else 0) release_date: date | None = None m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html) if m_rd: text = m_rd.group(1).strip() for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"): try: release_date = datetime.strptime(text, fmt).date() break except ValueError: continue description: str | None = None m_desc = _DESCRIPTION_RE.search(html) if m_desc: description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None rating: float | None = None m_rating = _RATING_RE.search(html) if m_rating: try: rating = float(m_rating.group(1)) except ValueError: pass poster_url: str | None = None for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE): m = rgx.search(html) if m: candidate = m.group(1).strip() if candidate and "blank.gif" not in candidate and "no-poster" not in candidate: poster_url = candidate break # Performers — tylko sekcja "Pornstars" ma /pornstar// linki, dooplay # filtruje cast w tej sekcji. Jaccard może łapać dubel ale dedup robimy w # resolverze (po performer_id). performers = [ RawPerformer( external_id=f"{source_name}:{m.group(1)}", name=_decode_html(m.group(2).strip()), ) for m in _PORNSTAR_RE.finditer(html) ] tags = [ RawTag( external_id=f"{source_name}:{m.group(1)}", name=_decode_html(m.group(2).strip()), slug=m.group(1), ) for m in _GENRE_TAG_RE.finditer(html) ] if page_url is None: page_url = f"{base_url}/movies/{slug}/" # Playback sources: każdy host (Doodstream/Lulu/RPM/...) jako osobny entry. # Dedup po href żeby ten sam host nie wpadł 2x. Raw landing page (origin= # source_name, bez :host) appendujemy TYLKO gdy nie ma żadnych sub-hosters — # inaczej myli usera (otwiera WebView z reklamami zamiast video; bug-report # 2026-05-16: "mangoporn przekierowuje do strony, reklama full screen"). playback_sources: list[RawPlaybackSource] = [] seen_hrefs: set[str] = set() # Hostery file-download (non-streamable) + malware. Mobile player nie potrafi # ich odtworzyć — rapidgator/nitroflare/frdl serwują .zip/.rar/.mp4 do download # (premium login required), streamtape ma malware drive-by .reg. Skipujemy # przy ingest żeby nie zaśmiecać UI martwym contentem (bug-report 2026-05-18). SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"} def _emit_host_entry(href: str, source: str | None) -> None: href = href.strip() if not href or href in seen_hrefs: return seen_hrefs.add(href) try: from urllib.parse import urlparse host = urlparse(href).hostname or "unknown" host_short = host.split(".")[-2] if host.count(".") >= 1 else host except Exception: host_short = "unknown" if host_short.lower() in SKIP_HOSTERS: return playback_sources.append( RawPlaybackSource( origin=f"{source_name}:{host_short}", page_url=href, embed_url=source or href, thumbnail_url=poster_url, duration_sec=duration_sec, ) ) # Stary `
  • ` markup (mangoporn). for m in _PLAYER_OPTION_RE.finditer(html): _emit_host_entry(m.group("href") or "", (m.group("source") or "").strip() or None) # Nowy `
    ` markup (pandamovies od ~2026-04 + nowe # streamporn instances). data-fl-source jest opcjonalny — capturujemy CAŁY # opening tag w group(1), data-fl-source extract z TEGO tagu (nie z window # lookback po HTMLu, bo to mogło pickować poprzedni cell). for m in _PLAYER_OPTION_DIV_RE.finditer(html): href = m.group("href") or "" opening_tag = m.group(1) src_match = _DATA_FL_SOURCE_RE.search(opening_tag) source = (src_match.group(1).strip() if src_match else "") or None _emit_host_entry(href, source) if not playback_sources: # Brak sub-hosters znalezionych — fallback do landing page (mobile otworzy # w WebView). Robimy to TYLKO gdy nie ma alternatyw, inaczej landing jest # niepotrzebnym ad-pageiem. playback_sources.append( RawPlaybackSource( origin=source_name, page_url=page_url, thumbnail_url=poster_url, ) ) return RawMovie( external_id=slug, title=title, description=description, release_year=release_year, release_date=release_date, duration_sec=duration_sec, rating=rating, poster_url=poster_url, url=page_url, studio=studio, performers=performers, tags=tags, playback_sources=playback_sources, raw={"slug": slug, "html_len": len(html)}, ) # ---- per-site instances ---------------------------------------------------- class StreampornConnector(DooplayConnector): name = "streamporn" base_url = "https://streamporn.nl" class PandamoviesConnector(DooplayConnector): name = "pandamovies" base_url = "https://pandamovies.pw" class MangopornConnector(DooplayConnector): name = "mangoporn" base_url = "https://mangoporn.net" # --------------------------------------------------------------------------- # Helpers (zduplikowane z paradisehill.py — celowo, żeby connectory były niezależne) # --------------------------------------------------------------------------- _TAG_RE = re.compile(r"<[^>]+>") def _strip_tags(s: str) -> str: return _TAG_RE.sub("", s) _HTML_ENTITIES = { "&": "&", "<": "<", ">": ">", """: '"', "'": "'", "'": "'", " ": " ", "’": "'", "‘": "'", "”": '"', "“": '"', "…": "...", "—": "—", "–": "–", } def _decode_html(s: str) -> str: for k, v in _HTML_ENTITIES.items(): s = s.replace(k, v) s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s) s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s) return s