feat(sources): remove 0dayxx + pornditt + pornhat entirely

Three orphan-factory tubes (0–0.2% canonical match — auto-screenshot thumbs and
slug titles that never match TPDB/StashDB) — to be replaced by better sources.
Removed scrapers (files + imports), extractors (registry + modules), the pornhat
entry from tag-enrichment priority lists and the 0dayxx display override, and purged
the DB (19,003 playback_sources + 9,904 solo-orphan scenes; shared mirror scenes keep
their other sources). The pornhat-based enrich_studio endpoint stays as a graceful
no-op (no pornhat sources → returns no studio).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-22 12:23:29 +02:00
parent 2f3e57c0ac
commit cbb2390a2a
10 changed files with 9 additions and 391 deletions

View file

@ -969,7 +969,7 @@ def enrich_tags_from_tube(
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
"xvideoscom", "xnxxcom", "pornhatcom"]
"xvideoscom", "xnxxcom"]
sources = session.execute(
select(PlaybackSource).where(
PlaybackSource.scene_id == scene_id,

View file

@ -91,7 +91,6 @@ _DISPLAY_OVERRIDES: dict[str, str] = {
"porn00org": "porn00.org",
"freshpornoorg": "freshporno.org",
"pornxpph": "pornxp.ph",
"0dayxxcom": "0dayxx.com",
"shyfapnet": "shyfap.net",
"hdporngg": "hdporn.gg",
"fullmoviesxxx": "fullmovies.xxx",

View file

@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
# Mainstream tubes priority dla tagów — bogate metadane.
_TAG_PRIORITY = [
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
"xvideoscom", "xnxxcom", "pornhatcom",
"xvideoscom", "xnxxcom",
]

View file

@ -34,9 +34,7 @@ from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScrape
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
from app.connectors.direct_scrapers.pornditt import PornDittScraper
from app.connectors.direct_scrapers.porndish import PornDishScraper
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
from app.connectors.direct_scrapers.siska import SiskaScraper
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
@ -48,7 +46,6 @@ from app.connectors.direct_scrapers.xnxx import XnxxScraper
from app.connectors.direct_scrapers.xvideos import XVideosScraper
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
from app.connectors.direct_scrapers.youporn import YouPornScraper
from app.connectors.direct_scrapers.zerodayxx import ZeroDayXXScraper
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# Existing 4 (verified, in production)
@ -58,12 +55,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# popunder redirect. Mobile WebView page-as-hoster pokazuje ad redirect zamiast video.
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
SxyLandScraper,
# ZeroDayXXScraper — wyłączony 2026-05-12 (source quality report): 25,596 scen, 0.1% canonical
# match. Slug-concat tytuły (`bella reese big butt ready to be filled with cum analized`) bez
# `[Studio]` lub `Studio - Perf - Title` prefixu (parse rate 3%) → resolver nie ma żadnego
# signalu do matchu. Wraps watchporn ale dziedziczy stripped metadata. Solo orphany usunięte
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
# resolvują).
# ZeroDayXXScraper (0dayxx) — USUNIĘTY CAŁKOWICIE 2026-06-22 (user request). Orphan
# factory (0.1% canonical), zastępujemy lepszymi źródłami. Dane/pliki/extractor skasowane.
# Mainstream (URL templates well-known)
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
@ -105,19 +98,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# zwraca consistent search results. KVS engine, slug-aware scene URLs. Mostly
# orphan ingest (auto-screenshots, no canonical phash match — sprawdzone), ale
# może łapać sceny popularnych performerów których jeszcze nie mamy w TPDB.
# PornHatScraper — wyłączony 2026-05-18. 9,799 scen, 0.2% canonical match, 100% solo-orphan.
# Pure orphan factory — auto-screenshot thumbs nie matchują phash do canonical, slug tytuły
# nie matchują rapidfuzz, brak duration/date signals. KEEP `pornhatcom` extractor i istniejące
# playback_sources żywe — mobile może je odtwarzać; disable tylko future ingest.
# PornDittScraper — wyłączony 2026-05-12 (bug-report 64356e9b). Każdy link
# produkował nową Scene row zamiast matchować do istniejącej kanonicznej
# (TPDB/StashDB) bo pornditt ma weak signal: title + cz. performera, brak
# fingerprintu/duration/date → composite_score zawsze poniżej auto_merge
# threshold (0.92). Plik scrapera + extractor zostają (istniejące playback_sources
# nadal się resolvują, _REGISTRY w app/extractors/__init__.py odpala
# `porndittcom` → _embed_iframe.extract). Re-enable wymaga albo
# "alternative-source mode" w resolverze (match-only, never create new),
# albo bogatszej extracji metadanych (duration + fingerprint).
# PornHat (pornhatcom) + PornDitt (porndittcom) — USUNIĘTE CAŁKOWICIE 2026-06-22
# (user request). Orphan factories (0.2% / weak-signal canonical match), zastępujemy
# lepszymi źródłami. Dane/pliki scraperów/extractory skasowane.
# Special
SxyPrnScraper,
PerverzijaScraper,

View file

@ -1,26 +0,0 @@
"""pornditt.com — direct HTML scrape.
KVS-style site (kt_player engine). Search URL: `/search/<slug>/?from=<page>` z slug-style
zapytaniem (spacje `-`). Sceny renderują się na subdomenie `v.pornditt.com/videos/<id>/<slug>/`,
więc regex matchuje oba (z i bez `v.` prefix).
Sitetag `porndittcom` (legacy z porn-app DEFAULT_SITETAGS suffix-stripped name).
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PornDittScraper(BaseSearchScraper):
    """Search scraper for pornditt.com, registered under the `porndittcom` sitetag."""

    sitetag = "porndittcom"
    _search_url_template = "https://pornditt.com/search/{query}/?from={page}"
    # Scene links may or may not carry the `v.` subdomain prefix; both forms match.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://(?:v\.)?pornditt\.com/videos/(?P<sid>\d+)/(?P<slug>[a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Convert a free-text query into a lowercase, hyphen-separated slug."""
        lowered = query.lower()
        # Each run of characters outside [a-z0-9] collapses into a single hyphen
        # (the original note says a `+`-encoded query does not work on this site).
        hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
        # Drop hyphens left over at either end by leading/trailing punctuation.
        return hyphenated.strip("-")

View file

@ -1,99 +0,0 @@
"""pornhat.com — search-mode scraper (performer-driven backfill).
KVS engine. Search URL: `/search/<query>/` z `+` jako space separator. Scene URLs
to `/video/<slug>/` (slug bez ID prefix, w przeciwieństwie do 3Movs/OK.xxx). Slug
zawiera tokens query gdy match jest relevant, więc filtruje się automatycznie.
Auto-screenshot thumbnaile (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
do canonical match przez phash NIE nadają się (sprawdzone w probe 2026-05-12, 8%).
Ale wartość scrapera: discovering nowych scen performera których inne tube'y/canonical
nie mają. Mostly orphan ingest, ale dla popular performers może łapać studio scenes
których nie mamy w TPDB jeszcze.
Metadata enrich: scene page ma `class="info-video js-ajax-{dvd,model,tag}"` div'y
z `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. Parsujemy w
`_fetch_scene_metadata()` żeby insertować studio (dvd), dodatkowych performerów
(models), i tagi do każdej sceny.
"""
from __future__ import annotations
import json
import logging
import re
from app.connectors.base import RawPerformer, RawStudio, RawTag
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Matches `class="info-video js-ajax-<kind>"` ... `data-setup='<json>'`. The JSON
# payload sits in a single-quoted HTML attribute, with double quotes inside for
# its string values. The named group `kind` (dvd / model / tag) records which
# element matched; the named group `json` captures the raw attribute payload.
_AJAX_DATA_RE = re.compile(
r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
re.IGNORECASE,
)
class PornHatScraper(BaseSearchScraper):
    """Search-mode scraper for pornhat.com (sitetag `pornhatcom`)."""

    sitetag = "pornhatcom"
    # KVS-style pagination: /search/<query>/<page>/ (page 1 also works with an explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
    # Search HTML uses relative hrefs `/video/<slug>/`. Per the original note,
    # BaseSearchScraper turns relative URLs into absolute ones (base class not
    # visible here — confirm there).
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Return the query trimmed, lowercased, with spaces replaced by hyphens."""
        return query.strip().lower().replace(" ", "-")

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch a scene page and parse its `js-ajax-{dvd,model,tag}` data-setup JSON.

        Returns:
            `(studio, performers, tags)` parsed from the page, or None when the
            request raises or the response status is not 200. Entries whose
            payload is not valid JSON, is not a JSON object, or has no usable
            `title` are skipped.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                return None
        except Exception as e:
            # Best-effort enrichment: a failed detail fetch must not break ingest.
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None

        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        for m in _AJAX_DATA_RE.finditer(r.text):
            kind = m.group("kind").lower()
            try:
                data = json.loads(m.group("json"))
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object; a list/number/string payload
            # has no `.get` and would otherwise abort the whole parse.
            if not isinstance(data, dict):
                continue
            raw_title = data.get("title")
            raw_dir = data.get("dir")
            # Non-string values (null, numbers, ...) are treated as missing.
            name = raw_title.strip() if isinstance(raw_title, str) else ""
            slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or None
            if not name:
                continue
            if kind == "dvd":
                # `dvd` is the studio/series wrapper (e.g. "Adult Time"). Only the
                # first occurrence is used as the scene's studio.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                performers.append(RawPerformer(name=name))
            elif kind == "tag":
                tags.append(RawTag(
                    external_id=f"pornhatcom:tag:{slug or name.lower()}",
                    name=name,
                    slug=slug,
                ))
        return studio, performers, tags

View file

@ -1,119 +0,0 @@
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
`https://0dayxx.com/0day-porn-video/<slug>/` (lub czasem `/<category>/<slug>/`).
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Scene links on a search-results page. Group 1 is the scene URL without its
# trailing slash (the optional `/` sits outside the group); group 2 is the slug,
# i.e. the last path segment.
_SCENE_URL_RE = re.compile(
r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
)
# `<meta property="og:title" content="...">`; group 1 is the raw `content` value.
_OG_TITLE_RE = re.compile(
r'<meta\s+property="og:title"\s+content="([^"]+)"', re.IGNORECASE
)
# `<meta property="og:image" content="...">`; group 1 is the raw `content` value.
_OG_IMAGE_RE = re.compile(
r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
    """Fetch a 0dayxx detail page and extract `(real_title, thumbnail_url)`.

    0dayxx is a wrapper (it embeds watchporn.to and others), so duration and tags
    are not available on this page. `og:image`, however, is present on 0dayxx and
    gives a thumbnail (small, 200x200, but better than none).

    Without this fetch, 0dayxx scenes entered dedup with the slug as title and no
    thumbnail_url, i.e. with the two weakest signals at once, which caused either
    missed matches or false-positive merges (reported 2026-05-09).

    Returns:
        `(title, thumbnail_url)`. Either element is None when the tag is missing
        or its value is empty; both are None when the request raises or the
        response status is not 200.
    """
    # Function-scope import keeps this block self-contained.
    import html

    try:
        r = browser_get(scene_url, timeout=20)
    except Exception as e:
        log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
        return None, None
    if r.status_code != 200:
        return None, None

    title = None
    thumb = None
    if (m := _OG_TITLE_RE.search(r.text)):
        # Attribute values are HTML-escaped in the page source (`&amp;`,
        # `&#8211;`, ...); decode them so the title matches what a browser shows.
        # Then strip the ` | 0dayxx.com Daily...` suffix some og:titles carry.
        title = html.unescape(m.group(1)).split("|")[0].strip() or None
    if (m := _OG_IMAGE_RE.search(r.text)):
        # `&amp;` inside a query string must become `&` for the URL to be usable.
        thumb = html.unescape(m.group(1)).strip() or None
    return title, thumb
class ZeroDayXXScraper(BaseDirectTubeScraper):
    """Direct HTML scraper for 0dayxx.com search results (sitetag `0dayxxcom`)."""

    sitetag = "0dayxxcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one 0dayxx search-results page.

        Args:
            query: Search text; it is also recorded as the performer name of
                every yielded scene.
            page: Results page number placed in the search URL.
            limit: Maximum number of scenes to yield; None means no cap. A
                non-positive limit yields nothing and performs no requests.

        Yields:
            One `RawScene` per unique scene URL whose slug contains at least one
            query token of three or more characters (no filtering happens when
            the query has no such token). Nothing is yielded when the search
            request raises or returns a non-200 status.
        """
        # Previously the cap was only checked after a yield, so `limit=0` still
        # fetched the search page plus one detail page and yielded one scene.
        if limit is not None and limit <= 0:
            return

        q = urllib.parse.quote_plus(query.strip())
        url = f"https://0dayxx.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            log.warning("0dayxx search fetch failed: %s", e)
            return
        if r.status_code != 200:
            return

        # Short tokens (< 3 chars) are too noisy to use as a relevance filter.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            # The regex captures the URL without its trailing slash; normalise it.
            scene_url = m.group(1) + "/"
            slug = m.group(2)
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue
            # One extra request per accepted scene: real title + thumbnail.
            real_title, thumb = _fetch_detail(scene_url)
            # Fall back to a de-hyphenated slug when the detail page gave no title.
            title = real_title or slug.replace("-", " ").strip()
            yield RawScene(
                external_id=f"0dayxxcom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:0dayxxcom",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
                performers=[RawPerformer(name=query.strip())],
                raw={
                    "source": "direct_scraper:0dayxx",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

View file

@ -38,8 +38,6 @@ from app.extractors.tubes import (
latestpornvideo,
paradisehill,
porn00,
pornditt,
pornhat,
porntrex,
sxyprn,
xhamster,
@ -85,10 +83,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# flashvars `video_url` → `get_file` 302 → CDN time-bound signed URL
# (`expires`+`md5`, NIE IP-bound) → mobile gra direct, zero VPS bandwidth.
"porntrexcom": porntrex.extract,
# pornditt — KVS jak yespornvip (function/0 + license). VPS dociera → resolve
# server-side (decode + follow 302 → portable twa.tgprn.com CDN). Wcześniej WebView
# fallback łapał VAST preroll (trafostatic) zamiast contentu. Patrz pornditt.py/_kvs.py.
"porndittcom": pornditt.extract,
# fpoxxx — KVS, plain get_file + license. 2026-06-01 (task #20): get_file 302 →
# `videos3.fpo.xxx/remote_control.php?acctoken=<base64>` — zdekodowany acctoken
# zawiera WBITY IP serwera-resolvera → definitywnie IP-bound. WebView only.
@ -118,10 +112,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# ~155k solo-scen upgrade z WebView-z-reklamami na natywne. Wcześniej WebView fallback
# ładował ad-heavy stronę z phone IP (działało, ale gorszy UX + preroll VAST).
"xhamstercom": xhamster.extract,
# PornHat — dedicated extractor: tylko `<source>` z player area (skip sidebar
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
# follow_redirects=True wymagane (fix w stream_proxy.py).
"pornhatcom": pornhat.extract,
# Freshporno KVS (function/0 + license). 2026-06-04 DevTools + cross-IP re-test
# NAPRAWIA błąd z #20: finalny cdn4.freshporno.org/remote_control.php jest PORTABLE
# (token time-bound nie IP-bound — VPS odtworzył token z residential → 206) ale
@ -152,9 +142,8 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# — używają identycznego embed-iframe pattern dla streamingu.
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
# 0dayxx wraps watchporn.to embed. watchporn.to/get_file/ token IP-bound (302→410
# cross-IP). Switch na WebView fallback. ~5k scen.
"0dayxxcom": _vps_blocked_fallback.extract,
# 0dayxx + pornditt + pornhat — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request): orphan
# factories (0–0.2% canonical match), zastępujemy lepszymi źródłami. Dane skasowane.
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
"perverzijacom": _embed_iframe.extract,
# Special: WebView-only (Yii2 session-bound player).

View file

@ -1,23 +0,0 @@
"""pornditt.com — KVS (kt_player) direct stream extractor. Patrz app/extractors/tubes/_kvs.py.
User bug 2026-05-31 (scene 40f118e1): "Pornditt łapie reklamę zamiast video". pornditt
był na _vps_blocked_fallback (WebView), gdzie scrape łapał VAST preroll (trafostatic) zamiast
contentu. Identyczny silnik jak yespornvip: flashvars `video_url`/`video_alt_url` =
`function/0/...get_file/...` + `license_code`; VPS dociera (HTTP 200). Resolve server-side:
decode + follow 302 portable CDN (twa.tgprn.com, time-bound, NIE IP/cookie-bound
zweryfikowane cross-IP 2026-06-01 fresh session 206 video/mp4). Native, multi-quality,
zero WebView/reklam.
NB: runtime `window.flashvars.video_url` pokazuje już ZDEKODOWANY plain get_file, ale raw
HTML (server-fetch) ma formę `function/0/...` + license dekodujemy sami (_kvs.real_url).
"""
from __future__ import annotations
from app.extractors._models import StreamSource
from app.extractors.tubes import _kvs
_BASE = "https://v.pornditt.com"
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Resolve stream sources for a pornditt page through the shared KVS resolver.

    Delegates to `_kvs.resolve_kvs` with this module's `_BASE` as `base_url`
    and returns its result unchanged.
    """
    resolved = _kvs.resolve_kvs(page_url, base_url=_BASE, timeout=timeout)
    return resolved

View file

@ -1,86 +0,0 @@
"""pornhat.com — KVS engine. get_file 302 → HLS m3u8 manifest.
**2026-05-18 bandwidth optimization**: pornhat CDN tokens (`cdn.privatehost.com`)
**time-bound, nie IP-bound** (`?sign=<HMAC>&exp_time=<unix>`). Zweryfikowane Chrome
DevTools MCP VPS-resolved URL działa z każdego IP, bez Referer header. Zamiast
zwracać `pornhat.com/get_file/` URL (mobile dostaje go i robi 302 chain przez VPS
proxy), robimy server-side resolve i zwracamy końcowy manifest URL z signed token.
Mobile ExoPlayer otrzymuje:
`https://nvms12.cdn.privatehost.com/hls/contents/.../?sign=...&exp_time=...`
i pobiera manifest + segments direct z CDN. **Zero VPS bandwidth** (poza ~5KB
initial resolve fetch).
`mobile_direct_ok=True` w `raw` mówi playback.py że dla type=m3u8 ten URL jest OK
dla `direct_url=raw_url` (zazwyczaj m3u8 by szły przez proxy).
Token wygasa za ~30-120 min od resolve (depends na lra param). User pause+resume
po >2h może dostać 403 mobile fallback na proxified URL re-resolve'a.
"""
from __future__ import annotations
import logging
import httpx
from app.extractors._models import StreamSource
from app.extractors.tubes._kvs_source import extract_kvs_sources
log = logging.getLogger(__name__)
def _resolve_get_file_redirect(get_file_url: str, *, timeout: float = 15.0) -> str | None:
    """Follow the redirect chain of a pornhat `get_file` URL and report where it ends.

    Returns:
        The final URL when it contains `cdn.privatehost.com` or `.m3u8`;
        None for any other final URL or when the request raises.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer": "https://www.pornhat.com/",
    }
    try:
        with httpx.Client(
            timeout=timeout,
            follow_redirects=True,
            headers=request_headers,
        ) as client:
            response = client.head(get_file_url)
            landed = str(response.url)
        # Accepted endings: an explicit `.m3u8` URL, or the generic CDN master URL
        # (/hls/contents/...), which per the original note is served with an m3u8
        # MIME type even without `.m3u8` in the path.
        if "cdn.privatehost.com" in landed or ".m3u8" in landed:
            return landed
        log.info("pornhat resolve: unexpected final URL %s", landed)
        return None
    except Exception as exc:
        log.warning("pornhat resolve %s failed: %s", get_file_url, exc)
        return None
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract pornhat stream sources, resolving each `get_file` URL server-side.

    Returns:
        None when the KVS extraction yields nothing. Otherwise one entry per
        extracted source, in the same order: a new m3u8 `StreamSource` pointing
        at the resolved URL and flagged `mobile_direct_ok`, or the original
        source when resolution failed.
    """
    kvs_sources = extract_kvs_sources(
        page_url, stream_type="m3u8", timeout=timeout, log_tag="pornhat"
    )
    if not kvs_sources:
        return None

    def _as_direct(src: StreamSource) -> StreamSource:
        # Swap the get_file link for the resolved CDN manifest URL when possible.
        manifest_url = _resolve_get_file_redirect(src.link)
        if not manifest_url:
            # Fallback: keep the original entry (the proxy will re-resolve it).
            return src
        return StreamSource(
            link=manifest_url,
            type="m3u8",
            quality=src.quality,
            referer=src.referer,
            raw={"mobile_direct_ok": True},
        )

    return [_as_direct(src) for src in kvs_sources]