# Change note (bug report 2026-05-28, "no new movies since yesterday"):
# DooplayConnector.fetch_movies used an unbounded `while True` over listing
# pages; streamporn (>2k movies) hung for hours until the scheduler's daily
# kill, blocking the queue for mangoporn + pandamovies. The watermark froze
# and 0 new movies were ingested per day.
# Fix: cap _MAX_PAGES_DELTA=3 (since-driven runs, ~144 newest items) and
# _MAX_PAGES_FULL=50 (full backfill when since=None). An earlier attempt to
# filter by release_date was rejected: release_date is the film's release
# date (e.g. 2013), not the upload date on the site, so it does not match
# the sort order of the listing.
# After deploy, manual re-runs: streamporn 144/46s, pandamovies 120/47s,
# mangoporn 108 with 72 NEW movies in 58s. Scheduler queue unblocked.
# Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic for mangoporn/streamporn/pandamovies.

These 3 sites use exactly the same template (theme=dooplay + PsyPlay player plugin),
so the connector is parameterized by `(base_url, source_name)` and run as 3 instances.

Listing: `/movies/page/N/` returns <a href="/movies/<slug>/"> per item.
Detail:  `/movies/<slug>/` has rich meta:
  - <h1> title (inside a class="data" wrapper)
  - <a href="/year/YYYY/" rel="tag"> production year
  - <a href="/studios/<slug>/" rel="tag"> studio
  - <span class='duration'>NN mins.</span> duration
  - <a href="/pornstar/<slug>/"> cast (multi)
  - <a href="/genre/<slug>/"> tags (multi)
  - <div itemprop="description"><p>...</p></div> description
  - <span class="dt_rating_vgs" itemprop="ratingValue">N</span> rating 0-10
  - <li ... data-fl-source="<embed_url>"><a href="<embed_link>">Host</a></li> player options

The player has multi-host options (DoodStream, LuluStream, RPMShare etc.) — each embed
URL becomes a separate `playback_source` with origin=`{site}:{host}` so that the mobile
client can later choose whose embed it wants to use to play the scene.
"""

from __future__ import annotations

import logging
import re
from collections.abc import Iterator
from datetime import date, datetime
from html import unescape
from typing import Any
from urllib.parse import urlparse

import httpx

from app.connectors.base import (
    BaseMovieConnector,
    RawMovie,
    RawPerformer,
    RawPlaybackSource,
    RawStudio,
    RawTag,
)
from app.extractors import browser_get
from app.models.source import SourceKind

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Desktop Chrome User-Agent string sent as the `User-Agent` request header.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)


# ---- selectors (theme-agnostic — work for any dooplay install) -------------

# Listing item — two patterns depending on the site:
#   1. mangoporn: plain <a href="https://site/movies/<slug>/"> without a class
#      (the theme renders the SEO-friendly URL directly in the grid)
#   2. streamporn/pandamovies: an <a> carrying class="ml-mask ..." whose href
#      has no /movies/ prefix (e.g. /watch-xxx-...-adult-movie-online-free/).
#      The pattern expects `href` to be the first attribute of the tag.
# Both are caught through alternation; exactly one of `url` / `url2` is set.
_LIST_ITEM_RE = re.compile(
    r'<a\s+href="(?P<url>https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"'
    r"|"
    r'<a\s+href="(?P<url2>https?://[^"]+/movies/[a-z0-9-]+/)"',
    re.IGNORECASE,
)
# Tolerant title — mangoporn (dooplay) uses an <h1> inside class="data",
# streamporn/pandamovies (raw PsyPlay theme) use <h3 itemprop="name">.
# Group 1 is the itemprop variant, group 2 the class="data" variant.
_TITLE_RE = re.compile(
    r'<h[1-6][^>]*\sitemprop="name"[^>]*>([^<]+)</h[1-6]>'
    r'|class="data"[^>]*>\s*<h[1-6][^>]*>([^<]+)</h[1-6]>',
    re.IGNORECASE | re.DOTALL,
)
# dooplay uses /year/, raw PsyPlay uses /release-year/. Same for the other
# slugs — themes inherit the base markup but customize the URL vocabulary.
_YEAR_RE = re.compile(
    r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"', re.IGNORECASE
)
_STUDIO_RE = re.compile(
    r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Duration: span class='duration' (dooplay) or
# <p><strong>Duration:</strong> X hrs. Y mins.</p> (PsyPlay)
_DURATION_SPAN_RE = re.compile(
    r"<span\s+class=['\"]duration['\"][^>]*>([^<]+)</span>", re.IGNORECASE
)
_DURATION_TEXT_RE = re.compile(
    r"<strong>\s*Duration:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Release date: span class='release_date' (dooplay) or
# <p><strong>Released Date:</strong> X</p> (PsyPlay)
_RELEASE_DATE_SPAN_RE = re.compile(
    r"<span\s+class=['\"]release_date['\"]'?[^>]*>([^<]+)</span>", re.IGNORECASE
)
_RELEASE_DATE_TEXT_RE = re.compile(
    r"<strong>\s*Released?\s*Date:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
_DESCRIPTION_RE = re.compile(
    r'itemprop="description"[^>]*>(.*?)</div>', re.IGNORECASE | re.DOTALL
)
_RATING_RE = re.compile(
    r'itemprop="ratingValue"[^>]*>([\d.]+)</span>', re.IGNORECASE
)
# Cast: dooplay /pornstar/, PsyPlay /actor/
_PORNSTAR_RE = re.compile(
    r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Genre: /genre(s)/ in both themes
_GENRE_TAG_RE = re.compile(
    r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Player options: data-fl-source is the hoster's original embed URL, data-fl-url
# is the page URL at the hoster. Old theme (mangoporn): `<li class="hosts-buttons-wpx">`.
# New theme (pandamovies since ~2026-04): `<div class="Rtable1-cell" data-fl-url=...
# data-fl-source=...>`. The two markups are matched by two separate regexes.
#
# The attributes of the <li> are read through lookaheads anchored right after
# `<li`, so they are found in any order. (Consuming them with a greedy `[^>]*`
# followed by optional groups never captures anything: the `[^>]*` swallows the
# attributes and the optional groups then match empty.) `source` / `page` are
# None when the attribute is absent.
_PLAYER_OPTION_RE = re.compile(
    r'<li(?=[^>]*\bclass="hosts-buttons-wpx")'
    r'(?:(?=[^>]*\bdata-fl-source="(?P<source>[^"]*)")|)'
    r'(?:(?=[^>]*\bdata-fl-url="(?P<page>[^"]*)")|)'
    r'[^>]*>\s*<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
# New pandamovies markup: `<div class="Rtable1-cell" data-fl-* ...><a href=...>HostName</a></div>`.
# Attributes come in url→source order and source is often empty
# (`data-fl-source=""` for doodstream/mixdrop/easyvidplayer). The WHOLE opening
# tag is captured in group(1) so that data-fl-source is guaranteed to belong to
# THIS div (an earlier 600-char window lookback could pick up the previous cell —
# doodstream→mixdrop cross-attribution, code-review #14).
_PLAYER_OPTION_DIV_RE = re.compile(
    r'(<div[^>]*\bclass="Rtable1-cell"[^>]*>)\s*'
    r'<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
_DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE)
# Poster — JSON-LD `thumbnailUrl` is the most stable (every dooplay/PsyPlay theme
# with SEO has a JSON-LD VideoObject schema). Fallback to the class="poster" img
# for old installs without the schema. Third fallback: og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE
)
_POSTER_RE = re.compile(
    r'class="poster"[^>]*>\s*<img\s+[^>]*src="([^"]+)"', re.IGNORECASE
)
_POSTER_OG_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
_DURATION_MINS_RE = re.compile(r"(\d+)\s*min", re.IGNORECASE)


class DooplayConnector(BaseMovieConnector):
    """Generic dooplay scraper. Instantiated per-site via subclasses.

    Subclasses must provide class-level `base_url` and `name`; `__init__`
    raises `RuntimeError` when either is missing or empty.
    """

    kind = SourceKind.scraper
    base_url: str
    name: str

    def __init__(self, *, timeout: float = 30.0):
        """Validate the class-level site configuration and store the request timeout."""
        if not getattr(self, "base_url", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `base_url`")
        if not getattr(self, "name", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `name`")
        self._timeout = timeout

    def close(self) -> None:
        """No-op: this connector only stores a timeout and holds nothing to release."""

    def _fetch(self, url: str) -> str:
        """Fetch *url* (absolute, or a path relative to `base_url`) and return the body text.

        Uses browser_get with chrome120 impersonation — psyplay sites sometimes
        block plain httpx (Python TLS fingerprint) by returning 500/403;
        curl_cffi fixes that.

        Raises:
            httpx.HTTPStatusError: when the response status is >= 400.
        """
        site_root = self.base_url.rstrip("/")
        if not url.startswith("http"):
            url = site_root + url
        headers = {
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            # rstrip above keeps the Referer free of a double slash.
            "Referer": site_root + "/",
        }
        r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True)
        if r.status_code >= 400:
            # browser_get's response is not an httpx one; wrap the failure in the
            # httpx exception type that fetch_movies catches.
            raise httpx.HTTPStatusError(
                f"{r.status_code} for {url}",
                request=None,  # type: ignore[arg-type]
                response=httpx.Response(r.status_code, text=r.text[:200]),
            )
        return r.text

    # Safety cap — a dooplay listing can have thousands of pages (streamporn.nl
    # has >2k movies). Without it the ingest hangs for hours and gets killed on
    # scheduler restart, blocking the following connectors in the queue
    # (bug-report 2026-05-28: "no new movies since yesterday" — streamporn hung
    # since 5-24 and blocked mangoporn + pandamovies). The listing is sorted by
    # upload date (NOT the film's release_date — release may be 2013 while the
    # upload is today), but the upload date is not in the markup, so filtering
    # by `since` through release_date does not work. Pragmatic page cap: 3 for
    # delta runs (≈150 new items/day is well above the real upload rate), 50 for
    # a full ingest (`since=None`).
    _MAX_PAGES_DELTA = 3
    _MAX_PAGES_FULL = 50

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Yield movies from the site's listing, newest listing pages first.

        Args:
            since: Only selects the page cap (`_MAX_PAGES_DELTA` when given,
                `_MAX_PAGES_FULL` when None); it is not used to filter items.
            limit: Maximum number of movies to yield; None means no limit and
                a value <= 0 yields nothing.

        Iteration stops on the first failed or empty listing page, when
        `limit` is reached, or after the page cap. Detail pages that fail to
        download or parse are skipped.
        """
        if limit is not None and limit <= 0:
            return
        max_pages = self._MAX_PAGES_DELTA if since is not None else self._MAX_PAGES_FULL
        seen_urls: set[str] = set()
        seen = 0
        for page in range(1, max_pages + 1):
            try:
                # list() drains the lazy generator here so that an HTTP error
                # raised while fetching the page is caught by this handler.
                urls = list(self._fetch_listing(page))
            except httpx.HTTPError as e:
                log.warning("%s listing page=%d failed: %s", self.name, page, e)
                return
            if not urls:
                log.info("%s: empty page=%d, stop", self.name, page)
                return
            for url in urls:
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                try:
                    movie = self._fetch_detail(url)
                except httpx.HTTPError as e:
                    log.warning("%s detail %s failed: %s", self.name, url, e)
                    continue
                if movie is None:
                    continue
                yield movie
                seen += 1
                if limit is not None and seen >= limit:
                    return
        log.info(
            "%s: hit max_pages=%d cap (delta=%s), stopping after seen=%d",
            self.name, max_pages, since is not None, seen,
        )

    def _fetch_listing(self, page: int) -> Iterator[str]:
        """Yield the detail-page URLs found on listing page *page*.

        Only links whose hostname equals the hostname of `base_url` are
        yielded; duplicates are not removed here.
        """
        text = self._fetch(self._listing_path(page))
        site_host = urlparse(self.base_url).hostname
        for m in _LIST_ITEM_RE.finditer(text):
            url = m.group("url") or m.group("url2")
            if not url:
                continue
            try:
                link_host = urlparse(url).hostname
            except ValueError:
                # Malformed URL in the markup — skip it.
                continue
            if link_host == site_host:
                yield url

    def _listing_path(self, page: int) -> str:
        """Return the site-relative path of listing page *page* (page 1 has no suffix)."""
        return "/movies/" if page == 1 else f"/movies/page/{page}/"

    def _fetch_detail(self, url: str) -> RawMovie | None:
        """Download the detail page at *url* and parse it; None when it has no title."""
        path = urlparse(url).path.rstrip("/")
        # Last path segment is the slug; "root" stands in for an empty path.
        slug = path.split("/")[-1] or "root"
        text = self._fetch(url)
        return _parse_dooplay_detail(
            slug=slug, page_url=url, html=text,
            source_name=self.name, base_url=self.base_url,
        )


def _parse_dooplay_detail(
    *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
) -> RawMovie | None:
    """Parse a dooplay/PsyPlay movie detail page into a `RawMovie`.

    Args:
        slug: Movie slug; becomes `external_id` and is used in the fallback page URL.
        html: Raw HTML of the detail page.
        source_name: Site name; prefixes studio/performer/tag external ids and
            playback-source origins.
        base_url: Site root, used only to build `page_url` when it is not given.
        page_url: URL of the detail page, if known.

    Returns:
        The parsed movie, or None (after logging a warning) when no title is found.
    """
    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("%s: no title in %s", source_name, slug)
        return None
    title = _decode_html((m_title.group(1) or m_title.group(2)).strip())

    m_year = _YEAR_RE.search(html)
    release_year = int(m_year.group(1)) if m_year else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_RE.search(html)
    if m_studio:
        studio_slug = m_studio.group(1)
        studio = RawStudio(
            external_id=f"{source_name}:{studio_slug}",
            name=_decode_html(m_studio.group(2).strip()),
            slug=studio_slug,
        )

    duration_sec: int | None = None
    m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
    if m_dur:
        dur_text = m_dur.group(1)
        # May be "32 mins." (dooplay) or "1 hrs. 12 mins." (PsyPlay).
        m_h = re.search(r"(\d+)\s*hr", dur_text, re.IGNORECASE)
        m_m = re.search(r"(\d+)\s*min", dur_text, re.IGNORECASE)
        if m_h or m_m:
            hours = int(m_h.group(1)) if m_h else 0
            minutes = int(m_m.group(1)) if m_m else 0
            duration_sec = hours * 3600 + minutes * 60

    release_date: date | None = None
    m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
    if m_rd:
        rd_text = m_rd.group(1).strip()
        # Try the known formats in turn; release_date stays None if none fits.
        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"):
            try:
                release_date = datetime.strptime(rd_text, fmt).date()
            except ValueError:
                continue
            break

    description: str | None = None
    m_desc = _DESCRIPTION_RE.search(html)
    if m_desc:
        description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None

    rating: float | None = None
    m_rating = _RATING_RE.search(html)
    if m_rating:
        try:
            rating = float(m_rating.group(1))
        except ValueError:
            # The pattern admits strings like "1.2.3"; treat them as "no rating".
            rating = None

    # First poster candidate (JSON-LD, then class="poster" img, then og:image)
    # that is not a placeholder image wins.
    poster_url: str | None = None
    for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
        m_poster = rgx.search(html)
        if not m_poster:
            continue
        candidate = m_poster.group(1).strip()
        if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
            poster_url = candidate
            break

    # Performers — only the "Pornstars" section has /pornstar/<slug>/ links.
    # Duplicates may slip through here; dedup is done in the resolver
    # (by performer_id).
    performers = [
        RawPerformer(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
        )
        for m in _PORNSTAR_RE.finditer(html)
    ]

    tags = [
        RawTag(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
            slug=m.group(1),
        )
        for m in _GENRE_TAG_RE.finditer(html)
    ]

    if page_url is None:
        page_url = f"{base_url.rstrip('/')}/movies/{slug}/"

    # Playback sources: every host (Doodstream/Lulu/RPM/...) is a separate entry,
    # deduplicated by href so the same link is not added twice. The raw landing
    # page (origin=source_name, without :host) is appended ONLY when there are no
    # sub-hosters — otherwise it misleads the user (opens a WebView with ads
    # instead of video; bug-report 2026-05-16: "mangoporn redirects to the site,
    # full-screen ad").
    playback_sources: list[RawPlaybackSource] = []
    seen_hrefs: set[str] = set()

    # File-download (non-streamable) hosters + malware. The mobile player cannot
    # play them — rapidgator/nitroflare/frdl serve .zip/.rar/.mp4 downloads
    # (premium login required), streamtape has a drive-by .reg malware. Skipped
    # at ingest so the UI is not cluttered with dead content (bug-report 2026-05-18).
    SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"}

    def _emit_host_entry(href: str, source: str | None) -> None:
        """Append one playback source for *href* unless it is a duplicate or a skipped hoster."""
        href = href.strip()
        if not href or href in seen_hrefs:
            return
        seen_hrefs.add(href)
        try:
            host = urlparse(href).hostname or "unknown"
        except ValueError:
            host = "unknown"
        # Second-to-last hostname label, e.g. "doodstream" for "www.doodstream.com".
        host_short = host.split(".")[-2] if "." in host else host
        if host_short.lower() in SKIP_HOSTERS:
            return
        playback_sources.append(
            RawPlaybackSource(
                origin=f"{source_name}:{host_short}",
                page_url=href,
                embed_url=source or href,
                thumbnail_url=poster_url,
                duration_sec=duration_sec,
            )
        )

    # Old `<li class="hosts-buttons-wpx">` markup (mangoporn).
    for m in _PLAYER_OPTION_RE.finditer(html):
        source = (m.group("source") or "").strip()
        if not source:
            # The named group can come back empty even though the attribute is
            # present, so also read data-fl-source straight from the matched
            # <li ...> opening tag (the text before the first ">").
            src_match = _DATA_FL_SOURCE_RE.search(m.group(0).split(">", 1)[0])
            source = src_match.group(1).strip() if src_match else ""
        _emit_host_entry(m.group("href") or "", source or None)

    # New `<div class="Rtable1-cell">` markup (pandamovies since ~2026-04 + new
    # streamporn instances). data-fl-source is optional — the WHOLE opening tag
    # is captured in group(1) and data-fl-source is extracted from THAT tag (not
    # from a lookback window over the HTML, which could pick the previous cell).
    for m in _PLAYER_OPTION_DIV_RE.finditer(html):
        src_match = _DATA_FL_SOURCE_RE.search(m.group(1))
        source = src_match.group(1).strip() if src_match else ""
        _emit_host_entry(m.group("href") or "", source or None)

    if not playback_sources:
        # No sub-hosters found — fall back to the landing page (mobile opens it
        # in a WebView). Done ONLY when there is no alternative, otherwise the
        # landing page is just an unnecessary ad page.
        playback_sources.append(
            RawPlaybackSource(
                origin=source_name,
                page_url=page_url,
                thumbnail_url=poster_url,
            )
        )

    return RawMovie(
        external_id=slug,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        duration_sec=duration_sec,
        rating=rating,
        poster_url=poster_url,
        url=page_url,
        studio=studio,
        performers=performers,
        tags=tags,
        playback_sources=playback_sources,
        raw={"slug": slug, "html_len": len(html)},
    )


# ---- per-site instances ----------------------------------------------------


class StreampornConnector(DooplayConnector):
    """DooplayConnector configured for streamporn.nl."""

    name = "streamporn"
    base_url = "https://streamporn.nl"


class PandamoviesConnector(DooplayConnector):
    """DooplayConnector configured for pandamovies.pw."""

    name = "pandamovies"
    base_url = "https://pandamovies.pw"


class MangopornConnector(DooplayConnector):
    """DooplayConnector configured for mangoporn.net."""

    name = "mangoporn"
    base_url = "https://mangoporn.net"


# ---------------------------------------------------------------------------
# Helpers (duplicated from paradisehill.py — on purpose, so that connectors
# stay independent of each other)
# ---------------------------------------------------------------------------

# Anything between "<" and the next ">" counts as a tag.
_TAG_RE = re.compile(r"<[^>]+>")


def _strip_tags(s: str) -> str:
    """Return *s* with every `<...>` tag removed; the text between tags is kept."""
    return _TAG_RE.sub("", s)


# Entity references with a fixed replacement. Most map to the character they
# stand for; curly quotes, the ellipsis and the non-breaking space are folded
# to plain-ASCII look-alikes instead.
# NOTE(review): the keys are written as entity references (the dictionary is a
# lookup of references to decode) — confirm the exact spellings, in particular
# the two apostrophe entries, against the upstream file.
_HTML_ENTITIES = {
    "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'",
    "&apos;": "'", "&nbsp;": " ", "&rsquo;": "'", "&lsquo;": "'",
    "&rdquo;": '"', "&ldquo;": '"', "&hellip;": "...", "&mdash;": "—", "&ndash;": "–",
}

# One entity reference: named (&name;), decimal (&#NN;) or hexadecimal (&#xHH;).
_ENTITY_RE = re.compile(r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);")


def _decode_html(s: str) -> str:
    """Decode HTML entity references in *s* in a single left-to-right pass.

    References listed in `_HTML_ENTITIES` get that table's replacement; every
    other named or numeric reference is resolved by the standard library's
    `html.unescape`, which leaves unknown names as they are and never raises
    on out-of-range code points.

    Decoding in one pass means the output of one replacement is never decoded
    again, so "&amp;lt;" becomes "&lt;", not "<".
    """

    def _replace(match: re.Match[str]) -> str:
        token = match.group(0)
        mapped = _HTML_ENTITIES.get(token)
        return mapped if mapped is not None else unescape(token)

    return _ENTITY_RE.sub(_replace, s)