refactor(ingest): rename scraper Source name "pornapp" -> "tube-scraper"
The umbrella Source.name for all direct tube scrapers (deep-crawl, browse-latest, performer-driven) was "pornapp" — a misleading leftover from the removed external porn-app API. It read like a dependency on a third-party "pornapp" service; it is not — these are our own scrapers hitting 25+ tubes directly (kind=scraper, origin tube:<sitetag>). Renamed to "tube-scraper" via a single SCRAPER_SOURCE_NAME constant; DB row renamed in place (UPDATE name, same id) so all ingest_runs + external_records history stays linked. No behavior change — external_id keying (sitetag:url) and dedup are unaffected. NOTE: playback_sources.origin "pornapp:<sitetag>" prefix is a separate legacy format (resolve_playback parses it) and is intentionally left untouched. Verified on prod: row renamed (0 stray "pornapp"), new runs land on "tube-scraper". Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
3339d3cd14
commit
a196fcbcdb
7 changed files with 25 additions and 14 deletions
|
|
@ -2,11 +2,15 @@
|
||||||
|
|
||||||
Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit
|
Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit
|
||||||
budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej
|
budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej
|
||||||
`Source(name='pornapp')` (legacy nazwa — kept for DB compat) z external_id
|
`Source(name=SCRAPER_SOURCE_NAME)` z external_id `f"{sitetag}:{url}"`. Resolver
|
||||||
`f"{sitetag}:{url}"`. Resolver mergeuje idempotentnie po tym kluczu.
|
mergeuje idempotentnie po tym kluczu.
|
||||||
|
|
||||||
Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`
|
Nazwa źródła: do 2026-06-07 brzmiała `"pornapp"` — myląca pozostałość po usuniętym
|
||||||
overrides w pornapp connector był specyficzny dla porn-app API i zostanie usunięty.
|
zewnętrznym porn-app API (sugerowała zależność od obcego serwisu, której NIE MA —
|
||||||
|
to nasze własne direct-scrapery tubów). Przemianowana na `"tube-scraper"`; wiersz
|
||||||
|
`sources` zaktualizowany w DB (UPDATE name) więc cała historia ingest_runs została.
|
||||||
|
|
||||||
|
Search-based ścieżka (per performer name); category browsing przez `categoriesUrl`.
|
||||||
|
|
||||||
UWAGA — speculative scrapers: większość aggregator + special tubes (xmoviesforyou,
|
UWAGA — speculative scrapers: większość aggregator + special tubes (xmoviesforyou,
|
||||||
watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak,
|
watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak,
|
||||||
|
|
@ -14,6 +18,11 @@ porndittcom, perverzija, fpoxxx, ...) ma URL templates + regex'y oparte na typow
|
||||||
WordPress conventions. Wymagają post-deploy verification — gdy któryś nie zwraca
|
WordPress conventions. Wymagają post-deploy verification — gdy któryś nie zwraca
|
||||||
wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku.
|
wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku.
|
||||||
"""
|
"""
|
||||||
|
# Umbrella Source.name dla wszystkich direct-scraperów (deep-crawl, browse-latest,
|
||||||
|
# performer-driven). Rename z legacy "pornapp" 2026-06-07 (mylące — nie ma zależności
|
||||||
|
# od zewnętrznego porn-app API).
|
||||||
|
SCRAPER_SOURCE_NAME = "tube-scraper"
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
|
||||||
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
from app.connectors.direct_scrapers.eporner import EpornerScraper
|
from app.connectors.direct_scrapers.eporner import EpornerScraper
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,8 @@ from app.connectors.base import RawScene
|
||||||
|
|
||||||
|
|
||||||
class BaseDirectTubeScraper(abc.ABC):
|
class BaseDirectTubeScraper(abc.ABC):
|
||||||
"""Kontrakt direct scrapera. Wszystkie scrapery feedują do `Source(name='pornapp')`
|
"""Kontrakt direct scrapera. Wszystkie scrapery feedują do
|
||||||
|
`Source(name=SCRAPER_SOURCE_NAME)` ("tube-scraper", rename z "pornapp" 2026-06-07)
|
||||||
żeby dziedziczyć logikę resolvera + idempotent merge per external_id."""
|
żeby dziedziczyć logikę resolvera + idempotent merge per external_id."""
|
||||||
|
|
||||||
sitetag: str
|
sitetag: str
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME
|
||||||
from app.models.source import SourceKind
|
from app.models.source import SourceKind
|
||||||
from app.scheduler.performer_driven import _ingest_iter_into_run
|
from app.scheduler.performer_driven import _ingest_iter_into_run
|
||||||
|
|
||||||
|
|
@ -46,7 +46,7 @@ def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters:
|
||||||
try:
|
try:
|
||||||
c = _ingest_iter_into_run(
|
c = _ingest_iter_into_run(
|
||||||
source_kind=SourceKind.scraper,
|
source_kind=SourceKind.scraper,
|
||||||
source_name="pornapp",
|
source_name=SCRAPER_SOURCE_NAME,
|
||||||
run_label=f"browse-latest:{scraper.sitetag}",
|
run_label=f"browse-latest:{scraper.sitetag}",
|
||||||
iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
|
iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
|
||||||
max_pages=mp
|
max_pages=mp
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from app.config import get_settings
|
from app.config import get_settings
|
||||||
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS, SCRAPER_SOURCE_NAME
|
||||||
from app.db import session_scope
|
from app.db import session_scope
|
||||||
from app.ingest import _process_scene, get_or_create_source
|
from app.ingest import _process_scene, get_or_create_source
|
||||||
from app.models.source import SourceKind
|
from app.models.source import SourceKind
|
||||||
|
|
@ -112,7 +112,7 @@ def run_deep_crawl(*, pages_per_run: int = 60, sitetags: list[str] | None = None
|
||||||
end = min(end, cap)
|
end = min(end, cap)
|
||||||
|
|
||||||
with session_scope() as session:
|
with session_scope() as session:
|
||||||
src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp")
|
src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME)
|
||||||
source_id = src.id
|
source_id = src.id
|
||||||
|
|
||||||
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
|
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
|
||||||
|
|
|
||||||
|
|
@ -156,7 +156,7 @@ def run_performer_driven(
|
||||||
# feedują do `Source(name='pornapp')` (legacy nazwa kept for DB compat) z
|
# feedują do `Source(name=SCRAPER_SOURCE_NAME)` ("tube-scraper", rename z "pornapp" 2026-06-07) z
|
||||||
# external_id `f"{sitetag}:{url}"` — resolver mergeuje idempotentnie cross-source.
|
# external_id `f"{sitetag}:{url}"` — resolver mergeuje idempotentnie cross-source.
|
||||||
if not skip_tubes:
|
if not skip_tubes:
|
||||||
from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS
|
from app.connectors.direct_scrapers import ALL_DIRECT_SCRAPERS, SCRAPER_SOURCE_NAME
|
||||||
|
|
||||||
sitetag_filter = set(sitetags or []) or None
|
sitetag_filter = set(sitetags or []) or None
|
||||||
scrapers = [
|
scrapers = [
|
||||||
|
|
@ -168,13 +168,13 @@ def run_performer_driven(
|
||||||
scraper = scraper_cls()
|
scraper = scraper_cls()
|
||||||
c = _ingest_iter_into_run(
|
c = _ingest_iter_into_run(
|
||||||
source_kind=SourceKind.scraper,
|
source_kind=SourceKind.scraper,
|
||||||
source_name="pornapp",
|
source_name=SCRAPER_SOURCE_NAME,
|
||||||
run_label=f"performer-driven:direct:{scraper.sitetag}:{target.canonical_name}",
|
run_label=f"performer-driven:direct:{scraper.sitetag}:{target.canonical_name}",
|
||||||
iterator_factory=lambda s=scraper, t=target: s.search(
|
iterator_factory=lambda s=scraper, t=target: s.search(
|
||||||
t.canonical_name, page=1, limit=50
|
t.canonical_name, page=1, limit=50
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
counters.merge("pornapp", c)
|
counters.merge(SCRAPER_SOURCE_NAME, c)
|
||||||
|
|
||||||
counters.performers_processed += 1
|
counters.performers_processed += 1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,8 @@ def main() -> int:
|
||||||
|
|
||||||
s = PornDoeScraper()
|
s = PornDoeScraper()
|
||||||
with session_scope() as session:
|
with session_scope() as session:
|
||||||
src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp")
|
from app.connectors.direct_scrapers import SCRAPER_SOURCE_NAME
|
||||||
|
src = get_or_create_source(session, kind=SourceKind.scraper, name=SCRAPER_SOURCE_NAME)
|
||||||
source_id = src.id
|
source_id = src.id
|
||||||
|
|
||||||
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
|
counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ def main() -> int:
|
||||||
JOIN scenes sc ON sc.id = ser.scene_id
|
JOIN scenes sc ON sc.id = ser.scene_id
|
||||||
JOIN sources s ON s.id = ser.source_id
|
JOIN sources s ON s.id = ser.source_id
|
||||||
LEFT JOIN studios st ON st.id = sc.studio_id
|
LEFT JOIN studios st ON st.id = sc.studio_id
|
||||||
WHERE s.name = 'pornapp'
|
WHERE s.name = 'tube-scraper'
|
||||||
AND ser.external_id LIKE :prefix
|
AND ser.external_id LIKE :prefix
|
||||||
ORDER BY sc.id
|
ORDER BY sc.id
|
||||||
LIMIT :lim
|
LIMIT :lim
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue