From a196fcbcdbb78422ef3eef243b735f398ec51a23 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Sun, 7 Jun 2026 16:54:55 +0200 Subject: [PATCH] refactor(ingest): rename scraper Source name "pornapp" -> "tube-scraper" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The umbrella Source.name for all direct tube scrapers (deep-crawl, browse-latest, performer-driven) was "pornapp" — a misleading leftover from the removed external porn-app API. It read like a dependency on a third-party "pornapp" service; it is not — these are our own scrapers hitting 25+ tubes directly (kind=scraper, origin tube:). Renamed to "tube-scraper" via a single SCRAPER_SOURCE_NAME constant; DB row renamed in place (UPDATE name, same id) so all ingest_runs + external_records history stays linked. No behavior change — external_id keying (sitetag:url) and dedup are unaffected. NOTE: playback_sources.origin "pornapp:" prefix is a separate legacy format (resolve_playback parses it) and is intentionally left untouched. Verified on prod: row renamed (0 stray "pornapp"), new runs land on "tube-scraper". Co-Authored-By: Claude Opus 4.8 --- app/connectors/direct_scrapers/__init__.py | 17 +++++++++++++---- app/connectors/direct_scrapers/base.py | 3 ++- app/scheduler/browse_latest.py | 4 ++-- app/scheduler/deep_crawl.py | 4 ++-- app/scheduler/performer_driven.py | 6 +++--- scripts/pilot_porndoe_deepcrawl.py | 3 ++- scripts/studio_retrofix.py | 2 +- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 0e9d4e8..171de5c 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -2,11 +2,15 @@ Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej -`Source(name='pornapp')` (legacy nazwa — kept for DB compat) z external_id -`f"{sitetag}:{url}"`. Resolver mergeuje idempotentnie po tym kluczu. +`Source(name=SCRAPER_SOURCE_NAME)` z external_id `f"{sitetag}:{url}"`. Resolver +mergeuje idempotentnie po tym kluczu. -Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl` -overrides w pornapp connector był specyficzny dla porn-app API i zostanie usunięty. +Nazwa źródła: do 2026-06-07 brzmiała `"pornapp"` — myląca pozostałość po usuniętym +zewnętrznym porn-app API (sugerowała zależność od obcego serwisu, której NIE MA — +to nasze własne direct-scrapery tubów). Przemianowana na `"tube-scraper"`; wiersz +`sources` zaktualizowany w DB (UPDATE name) więc cała historia ingest_runs została. + +Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`. UWAGA — speculative scrapers: większość aggregator + special tubes (xmoviesforyou, watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak, @@ -14,6 +18,11 @@ porndittcom, perverzija, fpoxxx, ...) ma URL templates + regex'y oparte na typow WordPress conventions. Wymagają post-deploy verification — gdy któryś nie zwraca wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku. """ +# Umbrella Source.name dla wszystkich direct-scraperów (deep-crawl, browse-latest, +# performer-driven). Rename z legacy "pornapp" 2026-06-07 (mylące — nie ma zależności +# od zewnętrznego porn-app API). +SCRAPER_SOURCE_NAME = "tube-scraper" + from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.connectors.direct_scrapers.eporner import EpornerScraper diff --git a/app/connectors/direct_scrapers/base.py b/app/connectors/direct_scrapers/base.py index 7354eb0..aba2391 100644 --- a/app/connectors/direct_scrapers/base.py +++ b/app/connectors/direct_scrapers/base.py @@ -8,7 +8,8 @@ from app.connectors.base import RawScene class BaseDirectTubeScraper(abc.ABC): - """Kontrakt direct scrapera. Wszystkie scrapery feedują do `Source(name='pornapp')` + """Kontrakt direct scrapera. Wszystkie scrapery feedują do + `Source(name=SCRAPER_SOURCE_NAME)` ("tube-scraper", rename z "pornapp" 2026-06-07) żeby dziedziczyć logikę resolvera + idempotent merge per external_id.""" sitetag: str diff --git a/app/scheduler/browse_latest.py b/app/scheduler/browse_latest.py index d1577c7..60cdac6 100644 --- a/app/scheduler/browse_latest.py +++ b/app/scheduler/browse_latest.py @@ -20,7 +20,7 @@ import logging import time from dataclasses import dataclass -from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS +from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME from app.models.source import SourceKind from app.scheduler.performer_driven import _ingest_iter_into_run @@ -46,7 +46,7 @@ def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters: try: c = _ingest_iter_into_run( source_kind=SourceKind.scraper, - source_name="pornapp", + source_name=SCRAPER_SOURCE_NAME, run_label=f"browse-latest:{scraper.sitetag}", iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes( max_pages=mp diff --git a/app/scheduler/deep_crawl.py b/app/scheduler/deep_crawl.py index 5aef6f7..330fe85 100644 --- a/app/scheduler/deep_crawl.py +++ b/app/scheduler/deep_crawl.py @@ -22,7 +22,7 @@ import time from pathlib import Path from app.config import get_settings -from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS +from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME from app.db import session_scope from app.ingest import _process_scene, get_or_create_source from app.models.source import SourceKind @@ -112,7 +112,7 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None end = min(end, cap) with session_scope() as session: - src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp") + src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME) source_id = src.id counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0} diff --git a/app/scheduler/performer_driven.py b/app/scheduler/performer_driven.py index 80da98a..d8212e9 100644 --- a/app/scheduler/performer_driven.py +++ b/app/scheduler/performer_driven.py @@ -156,7 +156,7 @@ def run_performer_driven( # feedują do `Source(name='pornapp')` (legacy nazwa kept for DB compat) z # external_id `f"{sitetag}:{url}"` — resolver mergeuje idempotentnie cross-source. if not skip_tubes: - from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS + from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS, SCRAPER_SOURCE_NAME sitetag_filter = set(sitetags or []) or None scrapers = [ @@ -168,13 +168,13 @@ def run_performer_driven( scraper = scraper_cls() c = _ingest_iter_into_run( source_kind=SourceKind.scraper, - source_name="pornapp", + source_name=SCRAPER_SOURCE_NAME, run_label=f"performer-driven:direct:{scraper.sitetag}:{target.canonical_name}", iterator_factory=lambda s=scraper, t=target: s.search( t.canonical_name, page=1, limit=50 ), ) - counters.merge("pornapp", c) + counters.merge(SCRAPER_SOURCE_NAME, c) counters.performers_processed += 1 diff --git a/scripts/pilot_porndoe_deepcrawl.py b/scripts/pilot_porndoe_deepcrawl.py index 06f6e3f..06a9f2c 100644 --- a/scripts/pilot_porndoe_deepcrawl.py +++ b/scripts/pilot_porndoe_deepcrawl.py @@ -32,7 +32,8 @@ def main() -> int: s = PornDoeScraper() with session_scope() as session: - src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp") + from app.connectors.direct_scrapers import SCRAPER_SOURCE_NAME + src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME) source_id = src.id counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0} diff --git a/scripts/studio_retrofix.py b/scripts/studio_retrofix.py index 5776190..fafe89c 100644 --- a/scripts/studio_retrofix.py +++ b/scripts/studio_retrofix.py @@ -80,7 +80,7 @@ def main() -> int: JOIN scenes sc ON sc.id = ser.scene_id JOIN sources s ON s.id = ser.source_id LEFT JOIN studios st ON st.id = sc.studio_id - WHERE s.name = 'pornapp' + WHERE s.name = 'tube-scraper' AND ser.external_id LIKE :prefix ORDER BY sc.id LIMIT :lim