feat(sources): remove pornhub + redtube entirely
Both scrapers had been disabled since 2026-05-12 (~0.4% canonical match — mostly short amateur clips that never match studio content), and their data sat frozen. They are now removed for good: deleted the extractor registry entries, scraper files, and imports; dropped them from the tag-enrichment priority lists; and purged the DB (17,906 playback_sources plus 122 scenes that had no other source; mirror scenes shared with other tubes only lost their PornHub/RedTube link).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1875604c6d
commit
90e391e255
6 changed files with 9 additions and 64 deletions
|
|
@ -969,7 +969,7 @@ def enrich_tags_from_tube(
|
|||
|
||||
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
||||
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||
"xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
|
||||
"xvideoscom", "xnxxcom", "pornhatcom"]
|
||||
sources = session.execute(
|
||||
select(PlaybackSource).where(
|
||||
PlaybackSource.scene_id == scene_id,
|
||||
|
|
|
|||
|
|
@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
|
|||
# Mainstream tubes priority dla tagów — bogate metadane.
|
||||
_TAG_PRIORITY = [
|
||||
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||
"xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
|
||||
"xvideoscom", "xnxxcom", "pornhatcom",
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -37,9 +37,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
|||
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
||||
from app.connectors.direct_scrapers.pornhub import PornHubScraper
|
||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||
from app.connectors.direct_scrapers.redtube import RedTubeScraper
|
||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||
|
|
@ -67,13 +65,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
|||
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
|
||||
# resolvują).
|
||||
# Mainstream (URL templates well-known)
|
||||
# PornHubScraper — wyłączony 2026-05-12 (analiza źródeł): 23,750 scen scrapnietych,
|
||||
# tylko 105 (0.4%) match z TPDB/StashDB. PH hostuje głównie własne shortened
|
||||
# clipy + amateur upload — nigdy nie zmatchują studio canonical content. Plik
|
||||
# zostaje (extractor `pornhubcom` używa go w playback resolve dla istniejących
|
||||
# playback_sources).
|
||||
# RedTubeScraper — wyłączony 2026-05-12 (analiza źródeł): 20,127 scen, 82 match
|
||||
# (0.4%). Same powody co PH (skrócone clipy + amateur upload).
|
||||
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
||||
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
||||
# i ekstraktory usunięte. Powód: skrócone amatorskie clipy, nigdy nie matchują studio.
|
||||
XVideosScraper,
|
||||
XnxxScraper,
|
||||
XHamsterScraper,
|
||||
|
|
|
|||
|
|
@ -1,24 +0,0 @@
|
|||
"""PornHub.com — direct HTML scrape search results.
|
||||
|
||||
Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
|
||||
Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||
|
||||
|
||||
class PornHubScraper(BaseSearchScraper):
|
||||
sitetag = "pornhubcom"
|
||||
_search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
|
||||
_scene_url_re = re.compile(
|
||||
r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
|
||||
)
|
||||
|
||||
def _slug_from_match(self, m, scene_url):
|
||||
# Pornhub URL nie ma slugu — używamy viewkey jako slug do query token filtering.
|
||||
# Tytuł będzie derived z viewkey (krótki ID), ale faktyczny title backfilluje
|
||||
# się przy resolve (yt-dlp ma metadata).
|
||||
return m.group("url").split("=")[-1]
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
"""RedTube.com — direct HTML scrape search results.
|
||||
|
||||
Search: `https://www.redtube.com/?search=<q>&page=<n>`
|
||||
Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL — viewkey-only).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||
|
||||
|
||||
class RedTubeScraper(BaseSearchScraper):
|
||||
sitetag = "redtubecom"
|
||||
_search_url_template = "https://www.redtube.com/?search={query}&page={page}"
|
||||
_scene_url_re = re.compile(
|
||||
r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
|
||||
)
|
||||
|
||||
def _title_from_slug(self, slug):
|
||||
# Numeric ID jako tytuł nie ma sensu — placeholder, title backfill przy resolve.
|
||||
return f"redtube:{slug}"
|
||||
|
|
@ -75,12 +75,9 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
|||
# NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
|
||||
# CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
|
||||
# playback.py daje mobile direct fetch, zero VPS bandwidth.
|
||||
# pornhub — 2026-06-02: yt-dlp z VPS dostaje HTTP 403 (Pornhub blokuje Hetzner IP;
|
||||
# yt-dlp aktualny, inne yt-dlp tuby działają → blok specyficzny dla PH). WebView
|
||||
# fallback gra z residential IP telefonu (jak xhamster). Wcześniej `_ytdlp.extract`
|
||||
# zwracał 0 źródeł → "nie działa odtwarzanie".
|
||||
"pornhubcom": _vps_blocked_fallback.extract,
|
||||
"redtubecom": _ytdlp.extract,
|
||||
# pornhub + redtube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Scrapery były
|
||||
# disabled od 2026-05-12 (0.4% canonical match — głównie skrócone amatorskie clipy),
|
||||
# zamrożone sceny/źródła skasowane z DB. Brak ekstraktorów → zero resolve.
|
||||
"xvideoscom": _ytdlp.extract,
|
||||
"xnxxcom": _ytdlp.extract,
|
||||
"youporncom": _ytdlp.extract,
|
||||
|
|
@ -227,7 +224,7 @@ def supported_sitetags() -> tuple[str, ...]:
|
|||
|
||||
def is_vps_blocked_fallback(sitetag: str) -> bool:
|
||||
"""True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN /
|
||||
ad-heavy / CAPTCHA — np. fpoxxx, pornxpph, pornhubcom). Takie źródła dają gorszy
|
||||
ad-heavy / CAPTCHA — np. fpoxxx, pornxpph). Takie źródła dają gorszy
|
||||
UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je
|
||||
rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena
|
||||
pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue