feat(sources): remove 0dayxx + pornditt + pornhat entirely
Three orphan-factory tubes (0–0.2% canonical match — auto-screenshot thumbs and slug titles that never match TPDB/StashDB) — to be replaced by better sources. Removed scrapers (files + imports), extractors (registry + modules), the pornhat entry from tag-enrichment priority lists and the 0dayxx display override, and purged the DB (19,003 playback_sources + 9,904 solo-orphan scenes; shared mirror scenes keep their other sources). The pornhat-based enrich_studio endpoint stays as a graceful no-op (no pornhat sources → returns no studio). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2f3e57c0ac
commit
cbb2390a2a
10 changed files with 9 additions and 391 deletions
|
|
@ -969,7 +969,7 @@ def enrich_tags_from_tube(
|
||||||
|
|
||||||
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
||||||
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "pornhatcom"]
|
"xvideoscom", "xnxxcom"]
|
||||||
sources = session.execute(
|
sources = session.execute(
|
||||||
select(PlaybackSource).where(
|
select(PlaybackSource).where(
|
||||||
PlaybackSource.scene_id == scene_id,
|
PlaybackSource.scene_id == scene_id,
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,6 @@ _DISPLAY_OVERRIDES: dict[str, str] = {
|
||||||
"porn00org": "porn00.org",
|
"porn00org": "porn00.org",
|
||||||
"freshpornoorg": "freshporno.org",
|
"freshpornoorg": "freshporno.org",
|
||||||
"pornxpph": "pornxp.ph",
|
"pornxpph": "pornxp.ph",
|
||||||
"0dayxxcom": "0dayxx.com",
|
|
||||||
"shyfapnet": "shyfap.net",
|
"shyfapnet": "shyfap.net",
|
||||||
"hdporngg": "hdporn.gg",
|
"hdporngg": "hdporn.gg",
|
||||||
"fullmoviesxxx": "fullmovies.xxx",
|
"fullmoviesxxx": "fullmovies.xxx",
|
||||||
|
|
|
||||||
|
|
@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
|
||||||
# Mainstream tubes priority dla tagów — bogate metadane.
|
# Mainstream tubes priority dla tagów — bogate metadane.
|
||||||
_TAG_PRIORITY = [
|
_TAG_PRIORITY = [
|
||||||
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "pornhatcom",
|
"xvideoscom", "xnxxcom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,9 +34,7 @@ from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScrape
|
||||||
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
||||||
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
||||||
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||||
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
|
||||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||||
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
|
||||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||||
|
|
@ -48,7 +46,6 @@ from app.connectors.direct_scrapers.xnxx import XnxxScraper
|
||||||
from app.connectors.direct_scrapers.xvideos import XVideosScraper
|
from app.connectors.direct_scrapers.xvideos import XVideosScraper
|
||||||
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
|
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
|
||||||
from app.connectors.direct_scrapers.youporn import YouPornScraper
|
from app.connectors.direct_scrapers.youporn import YouPornScraper
|
||||||
from app.connectors.direct_scrapers.zerodayxx import ZeroDayXXScraper
|
|
||||||
|
|
||||||
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# Existing 4 (verified, in production)
|
# Existing 4 (verified, in production)
|
||||||
|
|
@ -58,12 +55,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# popunder redirect. Mobile WebView page-as-hoster pokazuje ad redirect zamiast video.
|
# popunder redirect. Mobile WebView page-as-hoster pokazuje ad redirect zamiast video.
|
||||||
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
|
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
|
||||||
SxyLandScraper,
|
SxyLandScraper,
|
||||||
# ZeroDayXXScraper — wyłączony 2026-05-12 (source quality report): 25,596 scen, 0.1% canonical
|
# ZeroDayXXScraper (0dayxx) — USUNIĘTY CAŁKOWICIE 2026-06-22 (user request). Orphan
|
||||||
# match. Slug-concat tytuły (`bella reese big butt ready to be filled with cum analized`) bez
|
# factory (0.1% canonical), zastępujemy lepszymi źródłami. Dane/pliki/extractor skasowane.
|
||||||
# `[Studio]` lub `Studio - Perf - Title` prefixu (parse rate 3%) → resolver nie ma żadnego
|
|
||||||
# signalu do matchu. Wraps watchporn ale dziedziczy stripped metadata. Solo orphany usunięte
|
|
||||||
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
|
|
||||||
# resolvują).
|
|
||||||
# Mainstream (URL templates well-known)
|
# Mainstream (URL templates well-known)
|
||||||
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
||||||
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
||||||
|
|
@ -105,19 +98,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# zwraca consistent search results. KVS engine, slug-aware scene URLs. Mostly
|
# zwraca consistent search results. KVS engine, slug-aware scene URLs. Mostly
|
||||||
# orphan ingest (auto-screenshots, no canonical phash match — sprawdzone), ale
|
# orphan ingest (auto-screenshots, no canonical phash match — sprawdzone), ale
|
||||||
# może łapać sceny popularnych performerów których jeszcze nie mamy w TPDB.
|
# może łapać sceny popularnych performerów których jeszcze nie mamy w TPDB.
|
||||||
# PornHatScraper — wyłączony 2026-05-18. 9,799 scen, 0.2% canonical match, 100% solo-orphan.
|
# PornHat (pornhatcom) + PornDitt (porndittcom) — USUNIĘTE CAŁKOWICIE 2026-06-22
|
||||||
# Pure orphan factory — auto-screenshot thumbs nie matchują phash do canonical, slug tytuły
|
# (user request). Orphan factories (0.2% / weak-signal canonical match), zastępujemy
|
||||||
# nie matchują rapidfuzz, brak duration/date signals. KEEP `pornhatcom` extractor i istniejące
|
# lepszymi źródłami. Dane/pliki scraperów/extractory skasowane.
|
||||||
# playback_sources żywe — mobile może je odtwarzać; disable tylko future ingest.
|
|
||||||
# PornDittScraper — wyłączony 2026-05-12 (bug-report 64356e9b). Każdy link
|
|
||||||
# produkował nową Scene row zamiast matchować do istniejącej kanonicznej
|
|
||||||
# (TPDB/StashDB) bo pornditt ma weak signal: title + cz. performera, brak
|
|
||||||
# fingerprintu/duration/date → composite_score zawsze poniżej auto_merge
|
|
||||||
# threshold (0.92). Plik scrapera + extractor zostają (istniejące playback_sources
|
|
||||||
# nadal się resolvują, _REGISTRY w app/extractors/__init__.py odpala
|
|
||||||
# `porndittcom` → _embed_iframe.extract). Re-enable wymaga albo
|
|
||||||
# "alternative-source mode" w resolverze (match-only, never create new),
|
|
||||||
# albo bogatszej extracji metadanych (duration + fingerprint).
|
|
||||||
# Special
|
# Special
|
||||||
SxyPrnScraper,
|
SxyPrnScraper,
|
||||||
PerverzijaScraper,
|
PerverzijaScraper,
|
||||||
|
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
"""pornditt.com — direct HTML scrape.
|
|
||||||
|
|
||||||
KVS-style site (kt_player engine). Search URL: `/search/<slug>/?from=<page>` z slug-style
|
|
||||||
zapytaniem (spacje → `-`). Sceny renderują się na subdomenie `v.pornditt.com/videos/<id>/<slug>/`,
|
|
||||||
więc regex matchuje oba (z i bez `v.` prefix).
|
|
||||||
|
|
||||||
Sitetag `porndittcom` (legacy z porn-app DEFAULT_SITETAGS — suffix-stripped name).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
|
|
||||||
|
|
||||||
class PornDittScraper(BaseSearchScraper):
|
|
||||||
sitetag = "porndittcom"
|
|
||||||
_search_url_template = "https://pornditt.com/search/{query}/?from={page}"
|
|
||||||
_scene_url_re = re.compile(
|
|
||||||
r'href="(?P<url>https://(?:v\.)?pornditt\.com/videos/(?P<sid>\d+)/(?P<slug>[a-z0-9\-]+))/"',
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _format_query_for_url(self, query: str) -> str:
|
|
||||||
# KVS slug: lowercase, spacja/interpunkcja → `-`. URL-encoded (`+`) tu nie zadziała.
|
|
||||||
return re.sub(r"[^a-z0-9]+", "-", query.lower()).strip("-")
|
|
||||||
|
|
@ -1,99 +0,0 @@
|
||||||
"""pornhat.com — search-mode scraper (performer-driven backfill).
|
|
||||||
|
|
||||||
KVS engine. Search URL: `/search/<query>/` z `+` jako space separator. Scene URLs
|
|
||||||
to `/video/<slug>/` (slug bez ID prefix, w przeciwieństwie do 3Movs/OK.xxx). Slug
|
|
||||||
zawiera tokens query gdy match jest relevant, więc filtruje się automatycznie.
|
|
||||||
|
|
||||||
Auto-screenshot thumbnaile (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
|
|
||||||
— do canonical match przez phash NIE nadają się (sprawdzone w probe 2026-05-12, 8%).
|
|
||||||
Ale wartość scrapera: discovering nowych scen performera których inne tube'y/canonical
|
|
||||||
nie mają. Mostly orphan ingest, ale dla popular performers może łapać studio scenes
|
|
||||||
których nie mamy w TPDB jeszcze.
|
|
||||||
|
|
||||||
Metadata enrich: scene page ma `class="info-video js-ajax-{dvd,model,tag}"` div'y
|
|
||||||
z `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. Parsujemy w
|
|
||||||
`_fetch_scene_metadata()` żeby insertować studio (dvd), dodatkowych performerów
|
|
||||||
(models), i tagi do każdej sceny.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
from app.extractors import browser_get
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# `class="info-video js-ajax-<kind>"` ... `data-setup='<json>'`. JSON jest
|
|
||||||
# single-quoted (HTML attribute), z double-quotes wewnątrz dla string values.
|
|
||||||
# `\1` w replacement: backreference do `<kind>` żeby wiedzieć co matchujemy.
|
|
||||||
_AJAX_DATA_RE = re.compile(
|
|
||||||
r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PornHatScraper(BaseSearchScraper):
|
|
||||||
sitetag = "pornhatcom"
|
|
||||||
# Pagination KVS-style: /search/<query>/<page>/ (page=1 ALSO works z explicit `/1/`)
|
|
||||||
_search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
|
|
||||||
# PornHat search HTML używa relative hrefs `/video/<slug>/`. BaseSearchScraper
|
|
||||||
# automatycznie konwertuje relative → absolute via urlparse(search_url).netloc.
|
|
||||||
_scene_url_re = re.compile(
|
|
||||||
r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _format_query_for_url(self, query: str) -> str:
|
|
||||||
# KVS: lowercase + spaces → `-` (slug-style), działa też `+`
|
|
||||||
return query.strip().lower().replace(" ", "-")
|
|
||||||
|
|
||||||
def _fetch_scene_metadata(
|
|
||||||
self, scene_url: str
|
|
||||||
) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
|
|
||||||
"""Fetch scene detail + parse `js-ajax-{dvd,model,tag}` data-setup JSON."""
|
|
||||||
try:
|
|
||||||
r = browser_get(scene_url, timeout=self._timeout)
|
|
||||||
if r.status_code != 200:
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
|
|
||||||
return None
|
|
||||||
|
|
||||||
studio: RawStudio | None = None
|
|
||||||
performers: list[RawPerformer] = []
|
|
||||||
tags: list[RawTag] = []
|
|
||||||
|
|
||||||
for m in _AJAX_DATA_RE.finditer(r.text):
|
|
||||||
kind = m.group("kind").lower()
|
|
||||||
try:
|
|
||||||
data = json.loads(m.group("json"))
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
continue
|
|
||||||
name = (data.get("title") or "").strip()
|
|
||||||
slug = (data.get("dir") or "").strip() or None
|
|
||||||
if not name:
|
|
||||||
continue
|
|
||||||
if kind == "dvd":
|
|
||||||
# `dvd` to studio/series wrapper (np. "Adult Time"). Pierwsze
|
|
||||||
# wystąpienie bierzemy jako studio sceny — rzadko jest ich więcej.
|
|
||||||
if studio is None:
|
|
||||||
studio = RawStudio(
|
|
||||||
external_id=f"pornhatcom:dvd:{slug or name.lower()}",
|
|
||||||
name=name,
|
|
||||||
slug=slug,
|
|
||||||
)
|
|
||||||
elif kind == "model":
|
|
||||||
performers.append(RawPerformer(name=name))
|
|
||||||
elif kind == "tag":
|
|
||||||
tags.append(RawTag(
|
|
||||||
external_id=f"pornhatcom:tag:{slug or name.lower()}",
|
|
||||||
name=name,
|
|
||||||
slug=slug,
|
|
||||||
))
|
|
||||||
|
|
||||||
return studio, performers, tags
|
|
||||||
|
|
@ -1,119 +0,0 @@
|
||||||
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
|
|
||||||
|
|
||||||
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
|
|
||||||
`https://0dayxx.com/0day-porn-video/<slug>/` (lub czasem `/<category>/<slug>/`).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import urllib.parse
|
|
||||||
from collections.abc import Iterator
|
|
||||||
|
|
||||||
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
|
||||||
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
|
||||||
from app.extractors import browser_get
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
_SCENE_URL_RE = re.compile(
|
|
||||||
r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
|
|
||||||
)
|
|
||||||
_OG_TITLE_RE = re.compile(
|
|
||||||
r'<meta\s+property="og:title"\s+content="([^"]+)"', re.IGNORECASE
|
|
||||||
)
|
|
||||||
_OG_IMAGE_RE = re.compile(
|
|
||||||
r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
|
|
||||||
"""Pobiera 0dayxx detail page i wyciąga (real_title, thumbnail_url).
|
|
||||||
|
|
||||||
0dayxx jest wrapperem (embeduje watchporn.to/inne), więc duration/tagi tu
|
|
||||||
nie są — siedzą na watchporn.to. og:image jednak jest na 0dayxx i daje
|
|
||||||
miniaturkę z poprawnym wymiarem (200x200 — mała, ale lepsza niż żadna).
|
|
||||||
|
|
||||||
Bez tego fetch'u sceny 0dayxx trafiały do dedupu z slug'iem jako title +
|
|
||||||
bez thumbnail_url — czyli z dwoma najsłabszymi sygnałami na raz, co
|
|
||||||
powodowało albo brak match'y albo false-positive merge'y (zgłoszone
|
|
||||||
2026-05-09).
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
r = browser_get(scene_url, timeout=20)
|
|
||||||
except Exception as e:
|
|
||||||
log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
|
|
||||||
return None, None
|
|
||||||
if r.status_code != 200:
|
|
||||||
return None, None
|
|
||||||
title = None
|
|
||||||
thumb = None
|
|
||||||
if (m := _OG_TITLE_RE.search(r.text)):
|
|
||||||
# Strip ` | 0dayxx.com Daily...` suffix (powtórki og:title czasem mają go).
|
|
||||||
title = m.group(1).split("|")[0].strip()
|
|
||||||
if (m := _OG_IMAGE_RE.search(r.text)):
|
|
||||||
thumb = m.group(1).strip()
|
|
||||||
return title, thumb
|
|
||||||
|
|
||||||
|
|
||||||
class ZeroDayXXScraper(BaseDirectTubeScraper):
|
|
||||||
sitetag = "0dayxxcom"
|
|
||||||
|
|
||||||
def search(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
*,
|
|
||||||
page: int = 1,
|
|
||||||
limit: int | None = None,
|
|
||||||
) -> Iterator[RawScene]:
|
|
||||||
q = urllib.parse.quote_plus(query.strip())
|
|
||||||
url = f"https://0dayxx.com/page/{page}/?s={q}"
|
|
||||||
try:
|
|
||||||
r = browser_get(url, timeout=30)
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("0dayxx search fetch failed: %s", e)
|
|
||||||
return
|
|
||||||
if r.status_code != 200:
|
|
||||||
return
|
|
||||||
|
|
||||||
query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
|
|
||||||
|
|
||||||
seen: set[str] = set()
|
|
||||||
yielded = 0
|
|
||||||
for m in _SCENE_URL_RE.finditer(r.text):
|
|
||||||
scene_url = m.group(1) + "/"
|
|
||||||
slug = m.group(2)
|
|
||||||
if scene_url in seen:
|
|
||||||
continue
|
|
||||||
seen.add(scene_url)
|
|
||||||
|
|
||||||
slug_lower = slug.lower()
|
|
||||||
if query_tokens and not any(tok in slug_lower for tok in query_tokens):
|
|
||||||
continue
|
|
||||||
|
|
||||||
real_title, thumb = _fetch_detail(scene_url)
|
|
||||||
title = real_title or slug.replace("-", " ").strip()
|
|
||||||
|
|
||||||
yield RawScene(
|
|
||||||
external_id=f"0dayxxcom:{scene_url}",
|
|
||||||
title=title,
|
|
||||||
url=scene_url,
|
|
||||||
playback_sources=[
|
|
||||||
RawPlaybackSource(
|
|
||||||
origin="tube:0dayxxcom",
|
|
||||||
page_url=scene_url,
|
|
||||||
thumbnail_url=thumb,
|
|
||||||
)
|
|
||||||
],
|
|
||||||
performers=[RawPerformer(name=query.strip())],
|
|
||||||
raw={
|
|
||||||
"source": "direct_scraper:0dayxx",
|
|
||||||
"query": query,
|
|
||||||
"page": page,
|
|
||||||
"url": scene_url,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
yielded += 1
|
|
||||||
if limit is not None and yielded >= limit:
|
|
||||||
return
|
|
||||||
|
|
@ -38,8 +38,6 @@ from app.extractors.tubes import (
|
||||||
latestpornvideo,
|
latestpornvideo,
|
||||||
paradisehill,
|
paradisehill,
|
||||||
porn00,
|
porn00,
|
||||||
pornditt,
|
|
||||||
pornhat,
|
|
||||||
porntrex,
|
porntrex,
|
||||||
sxyprn,
|
sxyprn,
|
||||||
xhamster,
|
xhamster,
|
||||||
|
|
@ -85,10 +83,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# flashvars `video_url` → `get_file` 302 → CDN time-bound signed URL
|
# flashvars `video_url` → `get_file` 302 → CDN time-bound signed URL
|
||||||
# (`expires`+`md5`, NIE IP-bound) → mobile gra direct, zero VPS bandwidth.
|
# (`expires`+`md5`, NIE IP-bound) → mobile gra direct, zero VPS bandwidth.
|
||||||
"porntrexcom": porntrex.extract,
|
"porntrexcom": porntrex.extract,
|
||||||
# pornditt — KVS jak yespornvip (function/0 + license). VPS dociera → resolve
|
|
||||||
# server-side (decode + follow 302 → portable twa.tgprn.com CDN). Wcześniej WebView
|
|
||||||
# fallback łapał VAST preroll (trafostatic) zamiast contentu. Patrz pornditt.py/_kvs.py.
|
|
||||||
"porndittcom": pornditt.extract,
|
|
||||||
# fpoxxx — KVS, plain get_file + license. 2026-06-01 (task #20): get_file 302 →
|
# fpoxxx — KVS, plain get_file + license. 2026-06-01 (task #20): get_file 302 →
|
||||||
# `videos3.fpo.xxx/remote_control.php?acctoken=<base64>` — zdekodowany acctoken
|
# `videos3.fpo.xxx/remote_control.php?acctoken=<base64>` — zdekodowany acctoken
|
||||||
# zawiera WBITY IP serwera-resolvera → definitywnie IP-bound. WebView only.
|
# zawiera WBITY IP serwera-resolvera → definitywnie IP-bound. WebView only.
|
||||||
|
|
@ -118,10 +112,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# ~155k solo-scen upgrade z WebView-z-reklamami na natywne. Wcześniej WebView fallback
|
# ~155k solo-scen upgrade z WebView-z-reklamami na natywne. Wcześniej WebView fallback
|
||||||
# ładował ad-heavy stronę z phone IP (działało, ale gorszy UX + preroll VAST).
|
# ładował ad-heavy stronę z phone IP (działało, ale gorszy UX + preroll VAST).
|
||||||
"xhamstercom": xhamster.extract,
|
"xhamstercom": xhamster.extract,
|
||||||
# PornHat — dedicated extractor: tylko `<source>` z player area (skip sidebar
|
|
||||||
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
|
|
||||||
# follow_redirects=True wymagane (fix w stream_proxy.py).
|
|
||||||
"pornhatcom": pornhat.extract,
|
|
||||||
# Freshporno KVS (function/0 + license). 2026-06-04 DevTools + cross-IP re-test
|
# Freshporno KVS (function/0 + license). 2026-06-04 DevTools + cross-IP re-test
|
||||||
# NAPRAWIA błąd z #20: finalny cdn4.freshporno.org/remote_control.php jest PORTABLE
|
# NAPRAWIA błąd z #20: finalny cdn4.freshporno.org/remote_control.php jest PORTABLE
|
||||||
# (token time-bound nie IP-bound — VPS odtworzył token z residential → 206) ale
|
# (token time-bound nie IP-bound — VPS odtworzył token z residential → 206) ale
|
||||||
|
|
@ -152,9 +142,8 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# — używają identycznego embed-iframe pattern dla streamingu.
|
# — używają identycznego embed-iframe pattern dla streamingu.
|
||||||
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
|
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
|
||||||
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
|
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
|
||||||
# 0dayxx wraps watchporn.to embed. watchporn.to/get_file/ token IP-bound (302→410
|
# 0dayxx + pornditt + pornhat — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request): orphan
|
||||||
# cross-IP). Switch na WebView fallback. ~5k scen.
|
# factories (0–0.2% canonical match), zastępujemy lepszymi źródłami. Dane skasowane.
|
||||||
"0dayxxcom": _vps_blocked_fallback.extract,
|
|
||||||
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
|
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
|
||||||
"perverzijacom": _embed_iframe.extract,
|
"perverzijacom": _embed_iframe.extract,
|
||||||
# Special: WebView-only (Yii2 session-bound player).
|
# Special: WebView-only (Yii2 session-bound player).
|
||||||
|
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
"""pornditt.com — KVS (kt_player) direct stream extractor. Patrz app/extractors/tubes/_kvs.py.
|
|
||||||
|
|
||||||
User bug 2026-05-31 (scene 40f118e1): "Pornditt łapie reklamę zamiast video". pornditt
|
|
||||||
był na _vps_blocked_fallback (WebView), gdzie scrape łapał VAST preroll (trafostatic) zamiast
|
|
||||||
contentu. Identyczny silnik jak yespornvip: flashvars `video_url`/`video_alt_url` =
|
|
||||||
`function/0/...get_file/...` + `license_code`; VPS dociera (HTTP 200). Resolve server-side:
|
|
||||||
decode + follow 302 → portable CDN (twa.tgprn.com, time-bound, NIE IP/cookie-bound —
|
|
||||||
zweryfikowane cross-IP 2026-06-01 fresh session → 206 video/mp4). Native, multi-quality,
|
|
||||||
zero WebView/reklam.
|
|
||||||
|
|
||||||
NB: runtime `window.flashvars.video_url` pokazuje już ZDEKODOWANY plain get_file, ale raw
|
|
||||||
HTML (server-fetch) ma formę `function/0/...` + license — dekodujemy sami (_kvs.real_url).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
from app.extractors.tubes import _kvs
|
|
||||||
|
|
||||||
_BASE = "https://v.pornditt.com"
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
|
|
||||||
return _kvs.resolve_kvs(page_url, base_url=_BASE, timeout=timeout)
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
"""pornhat.com — KVS engine. get_file 302 → HLS m3u8 manifest.
|
|
||||||
|
|
||||||
**2026-05-18 bandwidth optimization**: pornhat CDN tokens (`cdn.privatehost.com`) są
|
|
||||||
**time-bound, nie IP-bound** (`?sign=<HMAC>&exp_time=<unix>`). Zweryfikowane Chrome
|
|
||||||
DevTools MCP — VPS-resolved URL działa z każdego IP, bez Referer header. Zamiast
|
|
||||||
zwracać `pornhat.com/get_file/` URL (mobile dostaje go i robi 302 chain przez VPS
|
|
||||||
proxy), robimy server-side resolve i zwracamy końcowy manifest URL z signed token.
|
|
||||||
|
|
||||||
Mobile ExoPlayer otrzymuje:
|
|
||||||
`https://nvms12.cdn.privatehost.com/hls/contents/.../?sign=...&exp_time=...`
|
|
||||||
i pobiera manifest + segments direct z CDN. **Zero VPS bandwidth** (poza ~5KB
|
|
||||||
initial resolve fetch).
|
|
||||||
|
|
||||||
`mobile_direct_ok=True` w `raw` mówi playback.py że dla type=m3u8 ten URL jest OK
|
|
||||||
dla `direct_url=raw_url` (zazwyczaj m3u8 by szły przez proxy).
|
|
||||||
|
|
||||||
Token wygasa za ~30-120 min od resolve (depends na lra param). User pause+resume
|
|
||||||
po >2h może dostać 403 → mobile fallback na proxified URL re-resolve'a.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
from app.extractors.tubes._kvs_source import extract_kvs_sources
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_get_file_redirect(get_file_url: str, *, timeout: float = 15.0) -> str | None:
|
|
||||||
"""Follow 302 chain pornhat.com/get_file/ → cdn.privatehost.com/hls/...
|
|
||||||
|
|
||||||
Returns final manifest URL z signed token, lub None gdy fail.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
with httpx.Client(
|
|
||||||
timeout=timeout,
|
|
||||||
follow_redirects=True,
|
|
||||||
headers={
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
||||||
"Referer": "https://www.pornhat.com/",
|
|
||||||
},
|
|
||||||
) as c:
|
|
||||||
r = c.head(get_file_url)
|
|
||||||
final = str(r.url)
|
|
||||||
if "cdn.privatehost.com" in final and ".m3u8" not in final:
|
|
||||||
# Generic master URL: /hls/contents/... CDN serves jako m3u8 mime
|
|
||||||
# nawet bez .m3u8 w path (sprawdzone Content-Type).
|
|
||||||
return final
|
|
||||||
if ".m3u8" in final:
|
|
||||||
return final
|
|
||||||
log.info("pornhat resolve: unexpected final URL %s", final)
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("pornhat resolve %s failed: %s", get_file_url, e)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
|
|
||||||
sources = extract_kvs_sources(
|
|
||||||
page_url, stream_type="m3u8", timeout=timeout, log_tag="pornhat"
|
|
||||||
)
|
|
||||||
if not sources:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Resolve każdy get_file URL → CDN signed manifest URL. Mobile dostaje direct.
|
|
||||||
resolved: list[StreamSource] = []
|
|
||||||
for s in sources:
|
|
||||||
final = _resolve_get_file_redirect(s.link)
|
|
||||||
if final:
|
|
||||||
resolved.append(
|
|
||||||
StreamSource(
|
|
||||||
link=final,
|
|
||||||
type="m3u8",
|
|
||||||
quality=s.quality,
|
|
||||||
referer=s.referer,
|
|
||||||
raw={"mobile_direct_ok": True},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Fallback: keep original (proxy will re-resolve)
|
|
||||||
resolved.append(s)
|
|
||||||
|
|
||||||
return resolved
|
|
||||||
Loading…
Add table
Reference in a new issue