From 0f19a6178912f71f9228a9c4f64bb0fdb900a806 Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Wed, 3 Jun 2026 10:11:25 +0200 Subject: [PATCH] feat(ingest): skip <180s tube scenes (trailers) + purge porndoe trailer orphans Deep-crawling tube catalogs pulls in lots of <3min trailers/teasers (porndoe). Add min_ingest_duration_sec (default 180): _process_scene skips scraper-source scenes whose known duration is below the floor (unknown duration kept; canonical TPDB/StashDB untouched). Deleted 67 existing porndoe-only orphan trailers (<180s, no canonical, no non-porndoe live playback) via cascade. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/config.py | 4 ++++ app/ingest.py | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/app/config.py b/app/config.py index f7e4c42..3bcea64 100644 --- a/app/config.py +++ b/app/config.py @@ -53,6 +53,10 @@ class Settings(BaseSettings): # False = wciągaj jak dawniej. Tube'y z clip-store studiem NIE są skipowane (mają playback). skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE") + # Minimalny duration sceny z tube/scraper przy ingescie — bool: + """True gdy scena ze scrapera/tube ma ZNANY duration < `min_ingest_duration_sec` + (trailer/teaser/preview — śmieć). Nieznany duration → NIE wycinamy (mogłaby być pełna + scena bez metadanych). Tylko scraper-source — canonical (TPDB/StashDB) zostawiamy. + porndoe/deep-crawl ciągną z głębi katalogu sporo trailerów <3min (2026-06-03).""" + floor = getattr(get_settings(), "min_ingest_duration_sec", 0) + if not floor: + return False + dur = norm.duration_sec + if dur is None: + ps_durs = [ps.duration_sec for ps in norm.playback_sources if ps.duration_sec] + dur = max(ps_durs) if ps_durs else None + if dur is None or dur >= floor: + return False + src = session.get(Source, source_id) + return src is not None and src.kind == SourceKind.scraper + + def _canonical_json(payload: dict) -> bytes: return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode() @@ -248,6 +266,10 @@ def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[ counters["skipped"] += 1 return + if _skip_short_tube_scene(session, source_id=source_id, norm=norm): + counters["skipped"] += 1 + return + result = resolve_scene(session, norm=norm, source_id=source_id) if result.was_created: