From 90e391e2552ad3bf571128c3e3d5cd4cb3cc2242 Mon Sep 17 00:00:00 2001
From: jtrzupek <jtrzupek@gmail.com>
Date: Mon, 22 Jun 2026 11:55:08 +0200
Subject: [PATCH] feat(sources): remove pornhub + redtube entirely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both scrapers had been disabled since 2026-05-12 (~0.4% canonical match — mostly short
amateur clips that never match studio content), and their data sat frozen. Removed for
good: deleted the extractor registry entries, scraper files and imports, dropped
redtubecom from the tag-enrichment priority lists (pornhubcom was not listed there), and
purged the DB (17,906 playback_sources + 122 scenes that had no other source; mirror
scenes shared with other tubes only lost their ph/rt link).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/api/scenes.py                          |  2 +-
 app/api/taxonomies.py                      |  2 +-
 app/connectors/direct_scrapers/__init__.py | 12 +++--------
 app/connectors/direct_scrapers/pornhub.py  | 24 ----------------------
 app/connectors/direct_scrapers/redtube.py  | 22 --------------------
 app/extractors/__init__.py                 | 11 ++++------
 6 files changed, 9 insertions(+), 64 deletions(-)
 delete mode 100644 app/connectors/direct_scrapers/pornhub.py
 delete mode 100644 app/connectors/direct_scrapers/redtube.py

diff --git a/app/api/scenes.py b/app/api/scenes.py
index 794d141..208e97e 100644
--- a/app/api/scenes.py
+++ b/app/api/scenes.py
@@ -969,7 +969,7 @@ def enrich_tags_from_tube(
 
     # Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
     PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
-                "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
+                "xvideoscom", "xnxxcom", "pornhatcom"]
     sources = session.execute(
         select(PlaybackSource).where(
             PlaybackSource.scene_id == scene_id,
diff --git a/app/api/taxonomies.py b/app/api/taxonomies.py
index 3c5fcf4..6037bbd 100644
--- a/app/api/taxonomies.py
+++ b/app/api/taxonomies.py
@@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
 # Mainstream tubes priority dla tagów — bogate metadane.
 _TAG_PRIORITY = [
     "xhamstercom", "porntrexcom", "epornercom", "youporncom",
-    "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
+    "xvideoscom", "xnxxcom", "pornhatcom",
 ]
 
 
diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py
index bbacf88..af3474b 100644
--- a/app/connectors/direct_scrapers/__init__.py
+++ b/app/connectors/direct_scrapers/__init__.py
@@ -37,9 +37,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
 from app.connectors.direct_scrapers.pornditt import PornDittScraper
 from app.connectors.direct_scrapers.porndish import PornDishScraper
 from app.connectors.direct_scrapers.pornhat import PornHatScraper  # noqa: F401 — kept for backref; ingest disabled
-from app.connectors.direct_scrapers.pornhub import PornHubScraper
 from app.connectors.direct_scrapers.porntrex import PornTrexScraper
-from app.connectors.direct_scrapers.redtube import RedTubeScraper
 from app.connectors.direct_scrapers.siska import SiskaScraper
 from app.connectors.direct_scrapers.sxyland import SxyLandScraper
 from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
@@ -67,13 +65,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
     # (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
     # resolvują).
     # Mainstream (URL templates well-known)
-    # PornHubScraper — wyłączony 2026-05-12 (analiza źródeł): 23,750 scen scrapnietych,
-    # tylko 105 (0.4%) match z TPDB/StashDB. PH hostuje głównie własne shortened
-    # clipy + amateur upload — nigdy nie zmatchują studio canonical content. Plik
-    # zostaje (extractor `pornhubcom` używa go w playback resolve dla istniejących
-    # playback_sources).
-    # RedTubeScraper — wyłączony 2026-05-12 (analiza źródeł): 20,127 scen, 82 match
-    # (0.4%). Same powody co PH (skrócone clipy + amateur upload).
+    # PornHub + RedTube — REMOVED ENTIRELY 2026-06-22 (user request). Disabled since
+    # 2026-05-12 (0.4% canonical match); frozen data purged from the DB, scraper files and
+    # extractor registry entries deleted. Reason: short amateur clips never match studio content.
     XVideosScraper,
     XnxxScraper,
     XHamsterScraper,
diff --git a/app/connectors/direct_scrapers/pornhub.py b/app/connectors/direct_scrapers/pornhub.py
deleted file mode 100644
index 464534d..0000000
--- a/app/connectors/direct_scrapers/pornhub.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""PornHub.com — direct HTML scrape search results.
-
-Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
-Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
-"""
-from __future__ import annotations
-
-import re
-
-from app.connectors.direct_scrapers._search_base import BaseSearchScraper
-
-
-class PornHubScraper(BaseSearchScraper):
-    sitetag = "pornhubcom"
-    _search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
-    _scene_url_re = re.compile(
-        r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
-    )
-
-    def _slug_from_match(self, m, scene_url):
-        # Pornhub URL nie ma slugu — używamy viewkey jako slug do query token filtering.
-        # Tytuł będzie derived z viewkey (krótki ID), ale faktyczny title backfilluje
-        # się przy resolve (yt-dlp ma metadata).
-        return m.group("url").split("=")[-1]
diff --git a/app/connectors/direct_scrapers/redtube.py b/app/connectors/direct_scrapers/redtube.py
deleted file mode 100644
index 3876f24..0000000
--- a/app/connectors/direct_scrapers/redtube.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""RedTube.com — direct HTML scrape search results.
-
-Search: `https://www.redtube.com/?search=<q>&page=<n>`
-Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL — viewkey-only).
-"""
-from __future__ import annotations
-
-import re
-
-from app.connectors.direct_scrapers._search_base import BaseSearchScraper
-
-
-class RedTubeScraper(BaseSearchScraper):
-    sitetag = "redtubecom"
-    _search_url_template = "https://www.redtube.com/?search={query}&page={page}"
-    _scene_url_re = re.compile(
-        r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
-    )
-
-    def _title_from_slug(self, slug):
-        # Numeric ID jako tytuł nie ma sensu — placeholder, title backfill przy resolve.
-        return f"redtube:{slug}"
diff --git a/app/extractors/__init__.py b/app/extractors/__init__.py
index 35806fe..d4173bf 100644
--- a/app/extractors/__init__.py
+++ b/app/extractors/__init__.py
@@ -75,12 +75,9 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
     # NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
     # CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
     # playback.py daje mobile direct fetch, zero VPS bandwidth.
-    # pornhub — 2026-06-02: yt-dlp z VPS dostaje HTTP 403 (Pornhub blokuje Hetzner IP;
-    # yt-dlp aktualny, inne yt-dlp tuby działają → blok specyficzny dla PH). WebView
-    # fallback gra z residential IP telefonu (jak xhamster). Wcześniej `_ytdlp.extract`
-    # zwracał 0 źródeł → "nie działa odtwarzanie".
-    "pornhubcom": _vps_blocked_fallback.extract,
-    "redtubecom": _ytdlp.extract,
+    # pornhub + redtube — REMOVED ENTIRELY 2026-06-22 (user request). The scrapers had been
+    # disabled since 2026-05-12 (0.4% canonical match — mostly short amateur clips) and the
+    # frozen scenes/sources were purged from the DB. No registry entry → nothing resolves.
     "xvideoscom": _ytdlp.extract,
     "xnxxcom": _ytdlp.extract,
     "youporncom": _ytdlp.extract,
@@ -227,7 +224,7 @@ def supported_sitetags() -> tuple[str, ...]:
 
 def is_vps_blocked_fallback(sitetag: str) -> bool:
     """True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN /
-    ad-heavy / CAPTCHA — np. fpoxxx, pornxpph, pornhubcom). Takie źródła dają gorszy
+    ad-heavy / CAPTCHA — np. fpoxxx, pornxpph). Takie źródła dają gorszy
     UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je
     rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena
     pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""