feat(ingest): skip <180s tube scenes (trailers) + purge porndoe trailer orphans
Deep-crawling tube catalogs pulls in lots of <3min trailers/teasers (porndoe). Add min_ingest_duration_sec (default 180): _process_scene skips scraper-source scenes whose known duration is below the floor (unknown duration kept; canonical TPDB/StashDB untouched). Deleted 67 existing porndoe-only orphan trailers (<180s, no canonical, no non-porndoe live playback) via cascade. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7e46e5ac48
commit
0f19a61789
2 changed files with 27 additions and 1 deletions
|
|
@ -53,6 +53,10 @@ class Settings(BaseSettings):
|
||||||
# False = wciągaj jak dawniej. Tube'y z clip-store studiem NIE są skipowane (mają playback).
|
# False = wciągaj jak dawniej. Tube'y z clip-store studiem NIE są skipowane (mają playback).
|
||||||
skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")
|
skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")
|
||||||
|
|
||||||
|
# Minimalny duration sceny z tube/scraper przy ingescie — <N s = trailer/teaser/preview.
|
||||||
|
# 0 = wyłączony. Nieznany duration nie jest wycinany. NIE dotyczy canonical (TPDB/StashDB).
|
||||||
|
min_ingest_duration_sec: int = Field(default=180, validation_alias="GOON_MIN_INGEST_DURATION_SEC")
|
||||||
|
|
||||||
# APScheduler (M5). Każdy 0/None = job wyłączony.
|
# APScheduler (M5). Każdy 0/None = job wyłączony.
|
||||||
sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
|
sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
|
||||||
sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
|
sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ from app.models.external_record import EntityKind, ExternalRecord
|
||||||
from app.models.ingest_run import IngestRun, IngestStatus
|
from app.models.ingest_run import IngestRun, IngestStatus
|
||||||
from app.models.source import Source, SourceKind
|
from app.models.source import Source, SourceKind
|
||||||
from app.normalize.movies import normalize_movie
|
from app.normalize.movies import normalize_movie
|
||||||
from app.normalize.scenes import normalize_scene
|
from app.normalize.scenes import NormalizedScene, normalize_scene
|
||||||
from app.resolve.movie_resolver import resolve_movie
|
from app.resolve.movie_resolver import resolve_movie
|
||||||
from app.resolve.scene_resolver import resolve_scene
|
from app.resolve.scene_resolver import resolve_scene
|
||||||
|
|
||||||
|
|
@ -66,6 +66,24 @@ def _skip_clip_store_canonical(session: Session, *, source_id: uuid.UUID, studio
|
||||||
return src is not None and src.kind in (SourceKind.tpdb, SourceKind.stashdb)
|
return src is not None and src.kind in (SourceKind.tpdb, SourceKind.stashdb)
|
||||||
|
|
||||||
|
|
||||||
|
def _skip_short_tube_scene(session: Session, *, source_id: uuid.UUID, norm: NormalizedScene) -> bool:
    """Return True when a scraper/tube scene is a known-short clip that must not be ingested.

    Scenes shorter than the ``min_ingest_duration_sec`` setting are treated as
    trailers/teasers/previews. A scene is skipped only when ALL of these hold:

    * the setting is a non-zero floor (0 disables the filter);
    * a duration is actually known and lies below that floor;
    * the source row for ``source_id`` exists and has kind ``SourceKind.scraper``
      (other source kinds, e.g. canonical TPDB/StashDB, are never filtered here).

    The scene-level ``norm.duration_sec`` is preferred; when it is unknown, the
    longest duration reported by ``norm.playback_sources`` is used instead.
    A missing OR zero duration counts as unknown at both levels; such scenes are
    kept, because they may be full scenes that simply lack metadata.

    Args:
        session: Database session used to look up the ``Source`` row.
        source_id: Primary key of the source the scene was fetched from.
        norm: Normalized scene carrying ``duration_sec`` and ``playback_sources``.

    Returns:
        True if the scene should be skipped, False otherwise.
    """
    # getattr with a default keeps the filter disabled when the settings object
    # does not expose the attribute.
    floor = getattr(get_settings(), "min_ingest_duration_sec", 0)
    if not floor:
        return False

    # A zero scene-level duration is a placeholder, not a measurement: treat it as
    # unknown, exactly as the playback-source fallback below already does.
    duration = norm.duration_sec or None
    if duration is None:
        duration = max(
            (ps.duration_sec for ps in norm.playback_sources if ps.duration_sec),
            default=None,
        )
    if duration is None or duration >= floor:
        return False

    # The source lookup is deferred until the scene is known to be short, so
    # full-length scenes never trigger it.
    source = session.get(Source, source_id)
    return source is not None and source.kind == SourceKind.scraper
|
||||||
|
|
||||||
|
|
||||||
def _canonical_json(payload: dict) -> bytes:
|
def _canonical_json(payload: dict) -> bytes:
|
||||||
return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode()
|
return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode()
|
||||||
|
|
||||||
|
|
@ -248,6 +266,10 @@ def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[
|
||||||
counters["skipped"] += 1
|
counters["skipped"] += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if _skip_short_tube_scene(session, source_id=source_id, norm=norm):
|
||||||
|
counters["skipped"] += 1
|
||||||
|
return
|
||||||
|
|
||||||
result = resolve_scene(session, norm=norm, source_id=source_id)
|
result = resolve_scene(session, norm=norm, source_id=source_id)
|
||||||
|
|
||||||
if result.was_created:
|
if result.was_created:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue