feat(sources): remove pornhub + redtube entirely
Both scrapers had been disabled since 2026-05-12 (~0.4% canonical match — mostly short amateur clips that never match studio content), and their data sat frozen. They are now removed for good: deleted the extractor registry entries, scraper files and imports, dropped `redtubecom` from the two tag-enrichment priority lists (`pornhubcom` was not in them), and purged the DB (17,906 playback_sources + 122 scenes that had no other source; mirror scenes shared with other tubes only lost their PornHub/RedTube link).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1875604c6d
commit
90e391e255
6 changed files with 9 additions and 64 deletions
|
|
@ -969,7 +969,7 @@ def enrich_tags_from_tube(
|
||||||
|
|
||||||
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
||||||
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
|
"xvideoscom", "xnxxcom", "pornhatcom"]
|
||||||
sources = session.execute(
|
sources = session.execute(
|
||||||
select(PlaybackSource).where(
|
select(PlaybackSource).where(
|
||||||
PlaybackSource.scene_id == scene_id,
|
PlaybackSource.scene_id == scene_id,
|
||||||
|
|
|
||||||
|
|
@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
|
||||||
# Mainstream tubes priority dla tagów — bogate metadane.
|
# Mainstream tubes priority dla tagów — bogate metadane.
|
||||||
_TAG_PRIORITY = [
|
_TAG_PRIORITY = [
|
||||||
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
|
"xvideoscom", "xnxxcom", "pornhatcom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,9 +37,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||||
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
||||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||||
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
||||||
from app.connectors.direct_scrapers.pornhub import PornHubScraper
|
|
||||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||||
from app.connectors.direct_scrapers.redtube import RedTubeScraper
|
|
||||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||||
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||||
|
|
@ -67,13 +65,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
|
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
|
||||||
# resolvują).
|
# resolvują).
|
||||||
# Mainstream (URL templates well-known)
|
# Mainstream (URL templates well-known)
|
||||||
# PornHubScraper — wyłączony 2026-05-12 (analiza źródeł): 23,750 scen scrapnietych,
|
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
||||||
# tylko 105 (0.4%) match z TPDB/StashDB. PH hostuje głównie własne shortened
|
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
||||||
# clipy + amateur upload — nigdy nie zmatchują studio canonical content. Plik
|
# i ekstraktory usunięte. Powód: skrócone amatorskie clipy, nigdy nie matchują studio.
|
||||||
# zostaje (extractor `pornhubcom` używa go w playback resolve dla istniejących
|
|
||||||
# playback_sources).
|
|
||||||
# RedTubeScraper — wyłączony 2026-05-12 (analiza źródeł): 20,127 scen, 82 match
|
|
||||||
# (0.4%). Same powody co PH (skrócone clipy + amateur upload).
|
|
||||||
XVideosScraper,
|
XVideosScraper,
|
||||||
XnxxScraper,
|
XnxxScraper,
|
||||||
XHamsterScraper,
|
XHamsterScraper,
|
||||||
|
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
"""PornHub.com — direct HTML scrape search results.
|
|
||||||
|
|
||||||
Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
|
|
||||||
Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
|
|
||||||
|
|
||||||
class PornHubScraper(BaseSearchScraper):
|
|
||||||
sitetag = "pornhubcom"
|
|
||||||
_search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
|
|
||||||
_scene_url_re = re.compile(
|
|
||||||
r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
|
|
||||||
)
|
|
||||||
|
|
||||||
def _slug_from_match(self, m, scene_url):
|
|
||||||
# Pornhub URL nie ma slugu — używamy viewkey jako slug do query token filtering.
|
|
||||||
# Tytuł będzie derived z viewkey (krótki ID), ale faktyczny title backfilluje
|
|
||||||
# się przy resolve (yt-dlp ma metadata).
|
|
||||||
return m.group("url").split("=")[-1]
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
"""RedTube.com — direct HTML scrape search results.
|
|
||||||
|
|
||||||
Search: `https://www.redtube.com/?search=<q>&page=<n>`
|
|
||||||
Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL — viewkey-only).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
|
|
||||||
|
|
||||||
class RedTubeScraper(BaseSearchScraper):
|
|
||||||
sitetag = "redtubecom"
|
|
||||||
_search_url_template = "https://www.redtube.com/?search={query}&page={page}"
|
|
||||||
_scene_url_re = re.compile(
|
|
||||||
r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
|
|
||||||
)
|
|
||||||
|
|
||||||
def _title_from_slug(self, slug):
|
|
||||||
# Numeric ID jako tytuł nie ma sensu — placeholder, title backfill przy resolve.
|
|
||||||
return f"redtube:{slug}"
|
|
||||||
|
|
@ -75,12 +75,9 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
|
# NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
|
||||||
# CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
|
# CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
|
||||||
# playback.py daje mobile direct fetch, zero VPS bandwidth.
|
# playback.py daje mobile direct fetch, zero VPS bandwidth.
|
||||||
# pornhub — 2026-06-02: yt-dlp z VPS dostaje HTTP 403 (Pornhub blokuje Hetzner IP;
|
# pornhub + redtube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Scrapery były
|
||||||
# yt-dlp aktualny, inne yt-dlp tuby działają → blok specyficzny dla PH). WebView
|
# disabled od 2026-05-12 (0.4% canonical match — głównie skrócone amatorskie clipy),
|
||||||
# fallback gra z residential IP telefonu (jak xhamster). Wcześniej `_ytdlp.extract`
|
# zamrożone sceny/źródła skasowane z DB. Brak ekstraktorów → zero resolve.
|
||||||
# zwracał 0 źródeł → "nie działa odtwarzanie".
|
|
||||||
"pornhubcom": _vps_blocked_fallback.extract,
|
|
||||||
"redtubecom": _ytdlp.extract,
|
|
||||||
"xvideoscom": _ytdlp.extract,
|
"xvideoscom": _ytdlp.extract,
|
||||||
"xnxxcom": _ytdlp.extract,
|
"xnxxcom": _ytdlp.extract,
|
||||||
"youporncom": _ytdlp.extract,
|
"youporncom": _ytdlp.extract,
|
||||||
|
|
@ -227,7 +224,7 @@ def supported_sitetags() -> tuple[str, ...]:
|
||||||
|
|
||||||
def is_vps_blocked_fallback(sitetag: str) -> bool:
|
def is_vps_blocked_fallback(sitetag: str) -> bool:
|
||||||
"""True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN /
|
"""True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN /
|
||||||
ad-heavy / CAPTCHA — np. fpoxxx, pornxpph, pornhubcom). Takie źródła dają gorszy
|
ad-heavy / CAPTCHA — np. fpoxxx, pornxpph). Takie źródła dają gorszy
|
||||||
UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je
|
UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je
|
||||||
rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena
|
rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena
|
||||||
pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""
|
pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue