feat(ingest): skip <180s tube scenes (trailers) + purge porndoe trailer orphans

Deep-crawling tube catalogs pulls in lots of <3min trailers/teasers (porndoe). Add
min_ingest_duration_sec (default 180): _process_scene skips scraper-source scenes whose
known duration is below the floor (unknown duration kept; canonical TPDB/StashDB
untouched). Deleted 67 existing porndoe-only orphan trailers (<180s, no canonical, no
non-porndoe live playback) via cascade.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-03 10:11:25 +02:00
parent 7e46e5ac48
commit 0f19a61789
2 changed files with 27 additions and 1 deletions

View file

@ -53,6 +53,10 @@ class Settings(BaseSettings):
# False = wciągaj jak dawniej. Tube'y z clip-store studiem NIE są skipowane (mają playback). # False = wciągaj jak dawniej. Tube'y z clip-store studiem NIE są skipowane (mają playback).
skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE") skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")
# Minimalny duration sceny z tube/scraper przy ingescie — <N s = trailer/teaser/preview.
# 0 = wyłączony. Nieznany duration nie jest wycinany. NIE dotyczy canonical (TPDB/StashDB).
min_ingest_duration_sec: int = Field(default=180, validation_alias="GOON_MIN_INGEST_DURATION_SEC")
# APScheduler (M5). Każdy 0/None = job wyłączony. # APScheduler (M5). Każdy 0/None = job wyłączony.
sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS") sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS") sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")

View file

@ -34,7 +34,7 @@ from app.models.external_record import EntityKind, ExternalRecord
from app.models.ingest_run import IngestRun, IngestStatus from app.models.ingest_run import IngestRun, IngestStatus
from app.models.source import Source, SourceKind from app.models.source import Source, SourceKind
from app.normalize.movies import normalize_movie from app.normalize.movies import normalize_movie
from app.normalize.scenes import normalize_scene from app.normalize.scenes import NormalizedScene, normalize_scene
from app.resolve.movie_resolver import resolve_movie from app.resolve.movie_resolver import resolve_movie
from app.resolve.scene_resolver import resolve_scene from app.resolve.scene_resolver import resolve_scene
@ -66,6 +66,24 @@ def _skip_clip_store_canonical(session: Session, *, source_id: uuid.UUID, studio
return src is not None and src.kind in (SourceKind.tpdb, SourceKind.stashdb) return src is not None and src.kind in (SourceKind.tpdb, SourceKind.stashdb)
def _skip_short_tube_scene(
    session: Session, *, source_id: uuid.UUID, norm: NormalizedScene
) -> bool:
    """Return True when a scraper/tube scene is known to be shorter than the ingest floor.

    Scenes below ``min_ingest_duration_sec`` are treated as trailer/teaser/preview
    junk: deep-crawling tube catalogs (e.g. porndoe) pulls in many sub-3-minute
    trailers (2026-06-03).

    Rules:
      * A floor of 0 (or a missing setting) disables the filter.
      * An unknown duration is never skipped; it could be a full scene that
        simply lacks metadata. A duration of 0 counts as unknown, both on the
        scene itself and on its playback sources.
      * When the scene has no duration of its own, the longest known playback
        source duration is used instead.
      * Only sources of kind ``SourceKind.scraper`` are filtered; canonical
        sources (TPDB/StashDB) are left untouched.

    Args:
        session: Database session used to look up the ``Source`` row.
        source_id: Primary key of the source the scene was ingested from.
        norm: Normalized scene whose duration is checked.

    Returns:
        True if the scene should be skipped, False otherwise.
    """
    floor = getattr(get_settings(), "min_ingest_duration_sec", 0)
    if not floor:
        return False

    # A scene-level duration of 0 is missing metadata, not a zero-second video.
    # Treat it like None so it falls back to the playback sources, which apply
    # the same "falsy means unknown" rule below.
    duration = norm.duration_sec or None
    if duration is None:
        duration = max(
            (ps.duration_sec for ps in norm.playback_sources if ps.duration_sec),
            default=None,
        )
    if duration is None or duration >= floor:
        return False

    # The source lookup comes last so scenes that pass the duration check
    # never hit the database.
    source = session.get(Source, source_id)
    return source is not None and source.kind == SourceKind.scraper
def _canonical_json(payload: dict) -> bytes: def _canonical_json(payload: dict) -> bytes:
return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode() return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode()
@ -248,6 +266,10 @@ def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[
counters["skipped"] += 1 counters["skipped"] += 1
return return
if _skip_short_tube_scene(session, source_id=source_id, norm=norm):
counters["skipped"] += 1
return
result = resolve_scene(session, norm=norm, source_id=source_id) result = resolve_scene(session, norm=norm, source_id=source_id)
if result.was_created: if result.was_created: