feat(sources): remove pornhub + redtube entirely

Both scrapers had been disabled since 2026-05-12 (~0.4% canonical match — mostly short amateur clips that never match studio content); their data sat frozen. Removed for good: deleted the extractor registry entries, scraper files and imports, dropped them from the tag-enrichment priority lists, and purged the DB (17,906 playback_sources + 122 scenes that had no other source; mirror scenes shared with other tubes just lost the ph/rt link).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-22 11:55:08 +02:00 · 2026-06-22 11:55:08 +02:00 · 90e391e255
commit 90e391e255
parent 1875604c6d
6 changed files with 9 additions and 64 deletions
--- a/app/api/scenes.py
+++ b/app/api/scenes.py
@ -969,7 +969,7 @@ def enrich_tags_from_tube(
    # Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
    PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
-                "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
+                "xvideoscom", "xnxxcom", "pornhatcom"]
    sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
--- a/app/api/taxonomies.py
+++ b/app/api/taxonomies.py
@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
 # Mainstream tubes priority dla tagów — bogate metadane.
 _TAG_PRIORITY = [
    "xhamstercom", "porntrexcom", "epornercom", "youporncom",
-    "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
+    "xvideoscom", "xnxxcom", "pornhatcom",
 ]
--- a/app/connectors/direct_scrapers/init.py
+++ b/app/connectors/direct_scrapers/init.py
@ -37,9 +37,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
 from app.connectors.direct_scrapers.pornditt import PornDittScraper
 from app.connectors.direct_scrapers.porndish import PornDishScraper
 from app.connectors.direct_scrapers.pornhat import PornHatScraper  # noqa: F401 — kept for backref; ingest disabled
-from app.connectors.direct_scrapers.pornhub import PornHubScraper
 from app.connectors.direct_scrapers.porntrex import PornTrexScraper
-from app.connectors.direct_scrapers.redtube import RedTubeScraper
 from app.connectors.direct_scrapers.siska import SiskaScraper
 from app.connectors.direct_scrapers.sxyland import SxyLandScraper
 from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
@ -67,13 +65,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
    # (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
    # resolvują).
    # Mainstream (URL templates well-known)
-    # PornHubScraper — wyłączony 2026-05-12 (analiza źródeł): 23,750 scen scrapnietych,
-    # tylko 105 (0.4%) match z TPDB/StashDB. PH hostuje głównie własne shortened
-    # clipy + amateur upload — nigdy nie zmatchują studio canonical content. Plik
-    # zostaje (extractor `pornhubcom` używa go w playback resolve dla istniejących
-    # playback_sources).
-    # RedTubeScraper — wyłączony 2026-05-12 (analiza źródeł): 20,127 scen, 82 match
-    # (0.4%). Same powody co PH (skrócone clipy + amateur upload).
+    # PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
+    # 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
+    # i ekstraktory usunięte. Powód: skrócone amatorskie clipy, nigdy nie matchują studio.
    XVideosScraper,
    XnxxScraper,
    XHamsterScraper,
--- a/app/connectors/direct_scrapers/pornhub.py
+++ b/app/connectors/direct_scrapers/pornhub.py
@ -1,24 +0,0 @@
 """PornHub.com — direct HTML scrape search results.
 Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
 Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
 """
 from __future__ import annotations
 import re
 from app.connectors.direct_scrapers._search_base import BaseSearchScraper
 class PornHubScraper(BaseSearchScraper):
    sitetag = "pornhubcom"
    _search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
    )
    def _slug_from_match(self, m, scene_url):
        # Pornhub URL nie ma slugu — używamy viewkey jako slug do query token filtering.
        # Tytuł będzie derived z viewkey (krótki ID), ale faktyczny title backfilluje
        # się przy resolve (yt-dlp ma metadata).
        return m.group("url").split("=")[-1]
--- a/app/connectors/direct_scrapers/redtube.py
+++ b/app/connectors/direct_scrapers/redtube.py
@ -1,22 +0,0 @@
 """RedTube.com — direct HTML scrape search results.
 Search: `https://www.redtube.com/?search=<q>&page=<n>`
 Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL — viewkey-only).
 """
 from __future__ import annotations
 import re
 from app.connectors.direct_scrapers._search_base import BaseSearchScraper
 class RedTubeScraper(BaseSearchScraper):
    sitetag = "redtubecom"
    _search_url_template = "https://www.redtube.com/?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
    )
    def _title_from_slug(self, slug):
        # Numeric ID jako tytuł nie ma sensu — placeholder, title backfill przy resolve.
        return f"redtube:{slug}"
--- a/app/extractors/init.py
+++ b/app/extractors/init.py
@ -75,12 +75,9 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
    # NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
    # CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
    # playback.py daje mobile direct fetch, zero VPS bandwidth.
-    # pornhub — 2026-06-02: yt-dlp z VPS dostaje HTTP 403 (Pornhub blokuje Hetzner IP;
-    # yt-dlp aktualny, inne yt-dlp tuby działają → blok specyficzny dla PH). WebView
-    # fallback gra z residential IP telefonu (jak xhamster). Wcześniej `_ytdlp.extract`
-    # zwracał 0 źródeł → "nie działa odtwarzanie".
-    "pornhubcom": _vps_blocked_fallback.extract,
-    "redtubecom": _ytdlp.extract,
+    # pornhub + redtube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Scrapery były
+    # disabled od 2026-05-12 (0.4% canonical match — głównie skrócone amatorskie clipy),
+    # zamrożone sceny/źródła skasowane z DB. Brak ekstraktorów → zero resolve.
    "xvideoscom": _ytdlp.extract,
    "xnxxcom": _ytdlp.extract,
    "youporncom": _ytdlp.extract,
@ -227,7 +224,7 @@ def supported_sitetags() -> tuple[str, ...]:
 def is_vps_blocked_fallback(sitetag: str) -> bool:
    """True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN /
-    ad-heavy / CAPTCHA — np. fpoxxx, pornxpph, pornhubcom). Takie źródła dają gorszy
+    ad-heavy / CAPTCHA — np. fpoxxx, pornxpph). Takie źródła dają gorszy
    UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je
    rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena
    pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""