From 90e391e2552ad3bf571128c3e3d5cd4cb3cc2242 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Mon, 22 Jun 2026 11:55:08 +0200 Subject: [PATCH] feat(sources): remove pornhub + redtube entirely MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both scrapers were disabled since 2026-05-12 (~0.4% canonical match — mostly short amateur clips that never match studio content); their data sat frozen. Removed for good: deleted the extractor registry entries, scraper files and imports, dropped them from the tag-enrichment priority lists, and purged the DB (17,906 playback_sources + 122 scenes that had no other source; mirror scenes shared with other tubes just lost the ph/rt link). Co-Authored-By: Claude Opus 4.8 (1M context) --- app/api/scenes.py | 2 +- app/api/taxonomies.py | 2 +- app/connectors/direct_scrapers/__init__.py | 12 +++-------- app/connectors/direct_scrapers/pornhub.py | 24 ---------------------- app/connectors/direct_scrapers/redtube.py | 22 -------------------- app/extractors/__init__.py | 11 ++++------ 6 files changed, 9 insertions(+), 64 deletions(-) delete mode 100644 app/connectors/direct_scrapers/pornhub.py delete mode 100644 app/connectors/direct_scrapers/redtube.py diff --git a/app/api/scenes.py b/app/api/scenes.py index 794d141..208e97e 100644 --- a/app/api/scenes.py +++ b/app/api/scenes.py @@ -969,7 +969,7 @@ def enrich_tags_from_tube( # Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage). 
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom", - "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"] + "xvideoscom", "xnxxcom", "pornhatcom"] sources = session.execute( select(PlaybackSource).where( PlaybackSource.scene_id == scene_id, diff --git a/app/api/taxonomies.py b/app/api/taxonomies.py index 3c5fcf4..6037bbd 100644 --- a/app/api/taxonomies.py +++ b/app/api/taxonomies.py @@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3 # Mainstream tubes priority dla tagów — bogate metadane. _TAG_PRIORITY = [ "xhamstercom", "porntrexcom", "epornercom", "youporncom", - "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom", + "xvideoscom", "xnxxcom", "pornhatcom", ] diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index bbacf88..af3474b 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -37,9 +37,7 @@ from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper from app.connectors.direct_scrapers.pornditt import PornDittScraper from app.connectors.direct_scrapers.porndish import PornDishScraper from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled -from app.connectors.direct_scrapers.pornhub import PornHubScraper from app.connectors.direct_scrapers.porntrex import PornTrexScraper -from app.connectors.direct_scrapers.redtube import RedTubeScraper from app.connectors.direct_scrapers.siska import SiskaScraper from app.connectors.direct_scrapers.sxyland import SxyLandScraper from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper @@ -67,13 +65,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się # resolvują). # Mainstream (URL templates well-known) - # PornHubScraper — wyłączony 2026-05-12 (analiza źródeł): 23,750 scen scrapnietych, - # tylko 105 (0.4%) match z TPDB/StashDB. 
PH hostuje głównie własne shortened - # clipy + amateur upload — nigdy nie zmatchują studio canonical content. Plik - # zostaje (extractor `pornhubcom` używa go w playback resolve dla istniejących - # playback_sources). - # RedTubeScraper — wyłączony 2026-05-12 (analiza źródeł): 20,127 scen, 82 match - # (0.4%). Same powody co PH (skrócone clipy + amateur upload). + # PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od + # 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów + # i ekstraktory usunięte. Powód: skrócone amatorskie clipy, nigdy nie matchują studio. XVideosScraper, XnxxScraper, XHamsterScraper, diff --git a/app/connectors/direct_scrapers/pornhub.py b/app/connectors/direct_scrapers/pornhub.py deleted file mode 100644 index 464534d..0000000 --- a/app/connectors/direct_scrapers/pornhub.py +++ /dev/null @@ -1,24 +0,0 @@ -"""PornHub.com — direct HTML scrape search results. - -Search: `https://www.pornhub.com/video/search?search=&page=` -Scene URL: `https://www.pornhub.com/view_video.php?viewkey=` -""" -from __future__ import annotations - -import re - -from app.connectors.direct_scrapers._search_base import BaseSearchScraper - - -class PornHubScraper(BaseSearchScraper): - sitetag = "pornhubcom" - _search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}" - _scene_url_re = re.compile( - r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"', - ) - - def _slug_from_match(self, m, scene_url): - # Pornhub URL nie ma slugu — używamy viewkey jako slug do query token filtering. - # Tytuł będzie derived z viewkey (krótki ID), ale faktyczny title backfilluje - # się przy resolve (yt-dlp ma metadata). 
- return m.group("url").split("=")[-1] diff --git a/app/connectors/direct_scrapers/redtube.py b/app/connectors/direct_scrapers/redtube.py deleted file mode 100644 index 3876f24..0000000 --- a/app/connectors/direct_scrapers/redtube.py +++ /dev/null @@ -1,22 +0,0 @@ -"""RedTube.com — direct HTML scrape search results. - -Search: `https://www.redtube.com/?search=&page=` -Scene URL: `https://www.redtube.com/` (slug nie ma w URL — viewkey-only). -""" -from __future__ import annotations - -import re - -from app.connectors.direct_scrapers._search_base import BaseSearchScraper - - -class RedTubeScraper(BaseSearchScraper): - sitetag = "redtubecom" - _search_url_template = "https://www.redtube.com/?search={query}&page={page}" - _scene_url_re = re.compile( - r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"', - ) - - def _title_from_slug(self, slug): - # Numeric ID jako tytuł nie ma sensu — placeholder, title backfill przy resolve. - return f"redtube:{slug}" diff --git a/app/extractors/__init__.py b/app/extractors/__init__.py index 35806fe..d4173bf 100644 --- a/app/extractors/__init__.py +++ b/app/extractors/__init__.py @@ -75,12 +75,9 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = { # NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube # CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w # playback.py daje mobile direct fetch, zero VPS bandwidth. - # pornhub — 2026-06-02: yt-dlp z VPS dostaje HTTP 403 (Pornhub blokuje Hetzner IP; - # yt-dlp aktualny, inne yt-dlp tuby działają → blok specyficzny dla PH). WebView - # fallback gra z residential IP telefonu (jak xhamster). Wcześniej `_ytdlp.extract` - # zwracał 0 źródeł → "nie działa odtwarzanie". - "pornhubcom": _vps_blocked_fallback.extract, - "redtubecom": _ytdlp.extract, + # pornhub + redtube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). 
Scrapery były + # disabled od 2026-05-12 (0.4% canonical match — głównie skrócone amatorskie clipy), + # zamrożone sceny/źródła skasowane z DB. Brak ekstraktorów → zero resolve. "xvideoscom": _ytdlp.extract, "xnxxcom": _ytdlp.extract, "youporncom": _ytdlp.extract, @@ -227,7 +224,7 @@ def supported_sitetags() -> tuple[str, ...]: def is_vps_blocked_fallback(sitetag: str) -> bool: """True gdy sitetag resolvuje się TYLKO przez WebView fallback (IP-bound CDN / - ad-heavy / CAPTCHA — np. fpoxxx, pornxpph, pornhubcom). Takie źródła dają gorszy + ad-heavy / CAPTCHA — np. fpoxxx, pornxpph). Takie źródła dają gorszy UX (reklamy, czarny ekran) niż natywny KVS/direct resolve, więc UI powinien je rankować NIŻEJ gdy scena ma też natywne źródło (bug-report 2026-06-07: scena pokazywała fpoxxx-WebView przed działającym freshporno bo sort był alfabetyczny)."""