refactor(ingest): rename scraper Source name "pornapp" -> "tube-scraper"

The umbrella Source.name for all direct tube scrapers (deep-crawl, browse-latest,
performer-driven) was "pornapp" — a misleading leftover from the removed external
porn-app API. It read like a dependency on a third-party "pornapp" service; it is
not — these are our own scrapers hitting 25+ tubes directly (kind=scraper,
origin tube:<sitetag>). Renamed to "tube-scraper" via a single SCRAPER_SOURCE_NAME
constant; DB row renamed in place (UPDATE name, same id) so all ingest_runs +
external_records history stays linked. No behavior change — external_id keying
(sitetag:url) and dedup are unaffected.

NOTE: playback_sources.origin "pornapp:<sitetag>" prefix is a separate legacy
format (resolve_playback parses it) and is intentionally left untouched.

Verified on prod: row renamed (0 stray "pornapp"), new runs land on "tube-scraper".

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-07 16:54:55 +02:00
parent 3339d3cd14
commit a196fcbcdb
7 changed files with 25 additions and 14 deletions

View file

@ -2,11 +2,15 @@
Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit
budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej
`Source(name='pornapp')` (legacy nazwa kept for DB compat) z external_id
`f"{sitetag}:{url}"`. Resolver mergeuje idempotentnie po tym kluczu.
`Source(name=SCRAPER_SOURCE_NAME)` z external_id `f"{sitetag}:{url}"`. Resolver
mergeuje idempotentnie po tym kluczu.
Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`
overrides w pornapp connector był specyficzny dla porn-app API i zostanie usunięty.
Nazwa źródła: do 2026-06-07 brzmiała `"pornapp"` — myląca pozostałość po usuniętym
zewnętrznym porn-app API (sugerowała zależność od obcego serwisu, której NIE MA —
to nasze własne direct-scrapery tubów). Przemianowana na `"tube-scraper"`; wiersz
`sources` zaktualizowany w DB (UPDATE name), więc cała historia ingest_runs została.
Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`.
UWAGA speculative scrapers: większość aggregator + special tubes (xmoviesforyou,
watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak,
@ -14,6 +18,11 @@ porndittcom, perverzija, fpoxxx, ...) ma URL templates + regex'y oparte na typow
WordPress conventions. Wymagają post-deploy verification — gdy któryś nie zwraca
wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku.
"""
# Umbrella Source.name dla wszystkich direct-scraperów (deep-crawl, browse-latest,
# performer-driven). Rename z legacy "pornapp" 2026-06-07 (mylące — nie ma zależności
# od zewnętrznego porn-app API).
SCRAPER_SOURCE_NAME = "tube-scraper"
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.connectors.direct_scrapers.eporner import EpornerScraper

View file

@ -8,7 +8,8 @@ from app.connectors.base import RawScene
class BaseDirectTubeScraper(abc.ABC):
"""Kontrakt direct scrapera. Wszystkie scrapery feedują do `Source(name='pornapp')`
"""Kontrakt direct scrapera. Wszystkie scrapery feedują do
`Source(name=SCRAPER_SOURCE_NAME)` ("tube-scraper", rename z "pornapp" 2026-06-07)
żeby dziedziczyć logikę resolvera + idempotent merge per external_id."""
sitetag: str

View file

@ -20,7 +20,7 @@ import logging
import time
from dataclasses import dataclass
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME
from app.models.source import SourceKind
from app.scheduler.performer_driven import _ingest_iter_into_run
@ -46,7 +46,7 @@ def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters:
try:
c = _ingest_iter_into_run(
source_kind=SourceKind.scraper,
source_name="pornapp",
source_name=SCRAPER_SOURCE_NAME,
run_label=f"browse-latest:{scraper.sitetag}",
iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
max_pages=mp

View file

@ -22,7 +22,7 @@ import time
from pathlib import Path
from app.config import get_settings
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME
from app.db import session_scope
from app.ingest import _process_scene, get_or_create_source
from app.models.source import SourceKind
@ -112,7 +112,7 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None
end = min(end, cap)
with session_scope() as session:
src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp")
src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME)
source_id = src.id
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}

View file

@ -156,7 +156,7 @@ def run_performer_driven(
# feedują do `Source(name=SCRAPER_SOURCE_NAME)` ("tube-scraper", rename z "pornapp"
# 2026-06-07) z external_id `f"{sitetag}:{url}"` — resolver mergeuje idempotentnie cross-source.
if not skip_tubes:
from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS
from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS, SCRAPER_SOURCE_NAME
sitetag_filter = set(sitetags or []) or None
scrapers = [
@ -168,13 +168,13 @@ def run_performer_driven(
scraper = scraper_cls()
c = _ingest_iter_into_run(
source_kind=SourceKind.scraper,
source_name="pornapp",
source_name=SCRAPER_SOURCE_NAME,
run_label=f"performer-driven:direct:{scraper.sitetag}:{target.canonical_name}",
iterator_factory=lambda s=scraper, t=target: s.search(
t.canonical_name, page=1, limit=50
),
)
counters.merge("pornapp", c)
counters.merge(SCRAPER_SOURCE_NAME, c)
counters.performers_processed += 1

View file

@ -32,7 +32,8 @@ def main() -> int:
s = PornDoeScraper()
with session_scope() as session:
src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp")
from app.connectors.direct_scrapers import SCRAPER_SOURCE_NAME
src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME)
source_id = src.id
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}

View file

@ -80,7 +80,7 @@ def main() -> int:
JOIN scenes sc ON sc.id = ser.scene_id
JOIN sources s ON s.id = ser.source_id
LEFT JOIN studios st ON st.id = sc.studio_id
WHERE s.name = 'pornapp'
WHERE s.name = 'tube-scraper'
AND ser.external_id LIKE :prefix
ORDER BY sc.id
LIMIT :lim