]*\bclass="Rtable1-cell"[^>]*>)\s*'
r'
]*href="(?P[^"]+)"[^>]*'
r'(?:[^<]*
]+>)?\s*([^<]+?)\s*',
re.IGNORECASE | re.DOTALL,
)
# Matches a `data-fl-source="..."` attribute; group 1 is the attribute value.
_DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE)
# Poster — the JSON-LD `thumbnailUrl` is the most stable source (every dooplay/PsyPlay
# theme with SEO ships a JSON-LD VideoObject schema). Fallback to the class="poster" img
# for old installations without the schema. Third fallback: the og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE
)
_POSTER_RE = re.compile(
r'class="poster"[^>]*>\s*
![]()
]*src="([^"]+)"', re.IGNORECASE
)
_POSTER_OG_RE = re.compile(
r'
None:
pass
def _fetch(self, url: str) -> str:
    """Download *url* and return the response body as text.

    Uses ``browser_get`` with chrome120 impersonation — psyplay sites sometimes
    block plain httpx (Python TLS fingerprint) by answering 500/403, and
    curl_cffi works around that.

    A *url* that does not start with ``"http"`` is treated as site-relative and
    appended to ``self.base_url``.

    Raises:
        httpx.HTTPStatusError: when the response status is 400 or higher.
    """
    target = url if url.startswith("http") else self.base_url.rstrip("/") + url
    response = browser_get(
        target,
        headers={
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            "Referer": self.base_url + "/",
        },
        timeout=self._timeout,
        follow_redirects=True,
    )
    status = response.status_code
    if status < 400:
        return response.text
    # Re-wrap the failure as an httpx error so callers only need to catch
    # httpx.HTTPError; only the first 200 chars of the body are retained.
    raise httpx.HTTPStatusError(
        f"{status} for {target}",
        request=None,  # type: ignore[arg-type]
        response=httpx.Response(status, text=response.text[:200]),
    )
# Safety fuse — a dooplay listing can have thousands of pages (streamporn.nl has
# >2k movies). Without it the ingest hangs for hours and gets killed on scheduler
# restart, blocking the next connectors in the queue (bug report 2026-05-28: "no
# new movies since yesterday" — streamporn had been hanging since 5-24, blocking
# mangoporn + pandamovies). The listing is sorted by upload date (NOT the movie's
# release_date — the release can be from 2013 while the upload is from today), but
# the upload date is not in the markup, so filtering by `since` via release_date
# does not work. Pragmatic page cap: 3 for delta (≈150 new items/day is far
# above the real upload rate), 50 for a full ingest (`since=None`).
_MAX_PAGES_DELTA = 3
_MAX_PAGES_FULL = 50
def fetch_movies(
    self,
    *,
    since: datetime | None = None,
    limit: int | None = None,
) -> Iterator[RawMovie]:
    """Walk the listing pages and yield one parsed movie per unique detail URL.

    Args:
        since: Only selects the page cap — ``_MAX_PAGES_DELTA`` when given,
            ``_MAX_PAGES_FULL`` when ``None``. It is not used to filter movies.
        limit: Maximum number of movies to yield; ``None`` means no limit.
            A value of zero or less yields nothing and performs no requests.

    Yields:
        Every movie for which ``_fetch_detail`` returned a non-``None`` result.

    Iteration stops on the first listing page that fails or is empty, when
    *limit* is reached, or when the page cap is hit. A failing detail page is
    logged and skipped.
    """
    # Guard first: the limit check below only runs after a yield, so without
    # this a non-positive limit would still fetch pages and emit one movie.
    if limit is not None and limit <= 0:
        return
    seen = 0
    page = 1
    seen_urls: set[str] = set()
    max_pages = self._MAX_PAGES_DELTA if since is not None else self._MAX_PAGES_FULL
    while page <= max_pages:
        try:
            # Materialise inside the try: _fetch_listing is lazy, so the HTTP
            # error would otherwise surface outside this handler.
            urls = list(self._fetch_listing(page))
        except httpx.HTTPError as e:
            log.warning("%s listing page=%d failed: %s", self.name, page, e)
            return
        if not urls:
            log.info("%s: empty page=%d, stop", self.name, page)
            return
        for url in urls:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                movie = self._fetch_detail(url)
            except httpx.HTTPError as e:
                log.warning("%s detail %s failed: %s", self.name, url, e)
                continue
            if movie is None:
                continue
            yield movie
            seen += 1
            if limit is not None and seen >= limit:
                return
        page += 1
    log.info(
        "%s: hit max_pages=%d cap (delta=%s), stopping after seen=%d",
        self.name, max_pages, since is not None, seen,
    )
def _fetch_listing(self, page: int) -> Iterator[str]:
    """Yield the movie URLs found on listing page *page*.

    Only URLs whose hostname equals the hostname of ``self.base_url`` are
    yielded; anything else (including URLs that cannot be parsed) is skipped.
    """
    from urllib.parse import urlparse

    listing_html = self._fetch(self._listing_path(page))
    own_host = urlparse(self.base_url).hostname
    for match in _LIST_ITEM_RE.finditer(listing_html):
        # The pattern exposes the link under one of two named groups.
        link = match.group("url") or match.group("url2")
        if not link:
            continue
        try:
            same_host = urlparse(link).hostname == own_host
        except Exception:
            continue
        if same_host:
            yield link
def _listing_path(self, page: int) -> str:
return "/movies/" if page == 1 else f"/movies/page/{page}/"
def _fetch_detail(self, url: str) -> RawMovie | None:
    """Fetch the detail page at *url* and parse it with ``_parse_dooplay_detail``.

    The slug is the last path segment of *url* (trailing slashes ignored),
    or ``"root"`` when the path has no segment.
    """
    from urllib.parse import urlparse

    trimmed_path = urlparse(url).path.rstrip("/")
    slug = trimmed_path.rsplit("/", 1)[-1] or "root"
    detail_html = self._fetch(url)
    return _parse_dooplay_detail(
        slug=slug,
        html=detail_html,
        source_name=self.name,
        base_url=self.base_url,
        page_url=url,
    )
def _parse_dooplay_detail(
    *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
) -> RawMovie | None:
    """Parse the HTML of a dooplay/PsyPlay movie detail page into a ``RawMovie``.

    Args:
        slug: Movie slug; used as the movie's ``external_id``.
        html: Full HTML of the detail page.
        source_name: Connector name; prefixes studio/performer/tag ids and
            playback-source origins.
        base_url: Site root; only used to build ``page_url`` when it is not given.
        page_url: URL of the detail page. Defaults to ``<base_url>/movies/<slug>/``.

    Returns:
        The parsed movie, or ``None`` (after logging a warning) when no title
        could be found in *html*.
    """
    from urllib.parse import urlparse

    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("%s: no title in %s", source_name, slug)
        return None
    title = _decode_html((m_title.group(1) or m_title.group(2)).strip())

    m_year = _YEAR_RE.search(html)
    release_year = int(m_year.group(1)) if m_year else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_RE.search(html)
    if m_studio:
        studio_slug = m_studio.group(1)
        studio_name = _decode_html(m_studio.group(2).strip())
        studio = RawStudio(
            external_id=f"{source_name}:{studio_slug}",
            name=studio_name,
            slug=studio_slug,
        )

    duration_sec: int | None = None
    m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
    if m_dur:
        dur_text = m_dur.group(1)
        # Can be "32 mins." (dooplay) or "1 hrs. 12 mins." (PsyPlay).
        m_h = re.search(r"(\d+)\s*hr", dur_text, re.IGNORECASE)
        m_m = re.search(r"(\d+)\s*min", dur_text, re.IGNORECASE)
        if m_h or m_m:
            hours = int(m_h.group(1)) if m_h else 0
            minutes = int(m_m.group(1)) if m_m else 0
            duration_sec = hours * 3600 + minutes * 60

    release_date: date | None = None
    m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
    if m_rd:
        date_text = m_rd.group(1).strip()
        # First format that parses wins; an unparseable date leaves release_date as None.
        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"):
            try:
                release_date = datetime.strptime(date_text, fmt).date()
                break
            except ValueError:
                continue

    description: str | None = None
    m_desc = _DESCRIPTION_RE.search(html)
    if m_desc:
        description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None

    rating: float | None = None
    m_rating = _RATING_RE.search(html)
    if m_rating:
        try:
            rating = float(m_rating.group(1))
        except ValueError:
            pass

    # Poster sources in priority order; placeholder images are rejected so the
    # next source gets a chance.
    poster_url: str | None = None
    for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
        m = rgx.search(html)
        if m:
            candidate = m.group(1).strip()
            if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
                poster_url = candidate
                break

    # Performers — only the "Pornstars" section has /pornstar/.../ links; dooplay
    # filters the cast in that section. Jaccard may catch a duplicate, but dedup
    # is done in the resolver (by performer_id).
    performers = [
        RawPerformer(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
        )
        for m in _PORNSTAR_RE.finditer(html)
    ]
    tags = [
        RawTag(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
            slug=m.group(1),
        )
        for m in _GENRE_TAG_RE.finditer(html)
    ]

    if page_url is None:
        # Strip a trailing slash first so a base_url ending in "/" does not
        # produce "//movies/..." (same normalisation the connector's _fetch uses).
        page_url = f"{base_url.rstrip('/')}/movies/{slug}/"

    # Playback sources: every host (Doodstream/Lulu/RPM/...) is a separate entry.
    # Dedup by href so the same host does not show up twice. The raw landing page
    # (origin=source_name, without :host) is appended ONLY when there are no
    # sub-hosters at all — otherwise it confuses the user (opens a WebView with
    # ads instead of the video; bug report 2026-05-16: "mangoporn redirects to a
    # page, full-screen ad").
    playback_sources: list[RawPlaybackSource] = []
    seen_hrefs: set[str] = set()
    # File-download (non-streamable) hosters + malware. The mobile player cannot
    # play them — rapidgator/nitroflare/frdl serve .zip/.rar/.mp4 for download
    # (premium login required), streamtape has a drive-by .reg malware. They are
    # skipped at ingest so the UI is not littered with dead content (bug report
    # 2026-05-18).
    SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"}

    def _emit_host_entry(href: str, source: str | None) -> None:
        """Append one playback source for *href* unless it is a duplicate or a skipped hoster."""
        href = href.strip()
        if not href or href in seen_hrefs:
            return
        seen_hrefs.add(href)
        try:
            host = urlparse(href).hostname or "unknown"
            # Second-to-last label ("dood" in "www.dood.to"); a dot-less host is used as is.
            host_short = host.split(".")[-2] if host.count(".") >= 1 else host
        except Exception:
            # Deliberately broad: a malformed href must never abort the whole parse.
            host_short = "unknown"
        if host_short.lower() in SKIP_HOSTERS:
            return
        playback_sources.append(
            RawPlaybackSource(
                origin=f"{source_name}:{host_short}",
                page_url=href,
                embed_url=source or href,
                thumbnail_url=poster_url,
                duration_sec=duration_sec,
            )
        )

    # Old player-option markup (mangoporn).
    for m in _PLAYER_OPTION_RE.finditer(html):
        _emit_host_entry(m.group("href") or "", (m.group("source") or "").strip() or None)
    # New player-option markup (pandamovies since ~2026-04 + new streamporn
    # instances). data-fl-source is optional — the WHOLE opening tag is captured
    # in group(1) and data-fl-source is extracted from THAT tag (not from a
    # look-back window over the HTML, which could pick up the previous cell).
    for m in _PLAYER_OPTION_DIV_RE.finditer(html):
        href = m.group("href") or ""
        opening_tag = m.group(1)
        src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
        source = (src_match.group(1).strip() if src_match else "") or None
        _emit_host_entry(href, source)

    if not playback_sources:
        # No sub-hosters found — fall back to the landing page (mobile opens it
        # in a WebView). Done ONLY when there is no alternative; otherwise the
        # landing page is just a needless ad page.
        playback_sources.append(
            RawPlaybackSource(
                origin=source_name,
                page_url=page_url,
                thumbnail_url=poster_url,
            )
        )

    return RawMovie(
        external_id=slug,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        duration_sec=duration_sec,
        rating=rating,
        poster_url=poster_url,
        url=page_url,
        studio=studio,
        performers=performers,
        tags=tags,
        playback_sources=playback_sources,
        raw={"slug": slug, "html_len": len(html)},
    )
# ---- per-site instances ----------------------------------------------------
class StreampornConnector(DooplayConnector):
    """DooplayConnector instance for https://streamporn.nl."""
    name = "streamporn"
    base_url = "https://streamporn.nl"
class PandamoviesConnector(DooplayConnector):
    """DooplayConnector instance for https://pandamovies.pw."""
    name = "pandamovies"
    base_url = "https://pandamovies.pw"
class MangopornConnector(DooplayConnector):
    """DooplayConnector instance for https://mangoporn.net."""
    name = "mangoporn"
    base_url = "https://mangoporn.net"
# ---------------------------------------------------------------------------
# Helpers (zduplikowane z paradisehill.py — celowo, żeby connectory były niezależne)
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&": "&", "<": "<", ">": ">", """: '"', "'": "'",
"'": "'", " ": " ", "’": "'", "‘": "'",
"”": '"', "“": '"', "…": "...", "—": "—", "–": "–",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
s = re.sub(r"(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s