From d5409d01cefb488a11a9446b2ec21a108ffaea49 Mon Sep 17 00:00:00 2001
From: jtrzupek
Date: Sun, 7 Jun 2026 19:52:44 +0200
Subject: [PATCH] =?UTF-8?q?feat(scripts):=20audit=5Fteaser=5Fonly=20?=
 =?UTF-8?q?=E2=80=94=20hide=20scenes=20whose=20only=20source=20is=20a=20te?=
 =?UTF-8?q?aser?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bug-report 2026-06-01 (48d6cc6b): scene shows canonical duration from
TPDB (real 22min studio scene) but the only live playback_source is a
short tube teaser (xnxx 21s) → "shows 22m, plays <1m".

When ALL live sources are a tiny fraction (<15%) of a known canonical
(>300s), the scene has no real playback; mark those sources dead → scene
becomes orphan → hidden (has_playback=false), consistent with the
orphan-hiding policy. Reversible (dead_at), conservative (skips scenes
with any unknown-duration or full-length live source).

Applied on prod: 182 sources dead across 174 scenes.

Co-Authored-By: Claude Opus 4.8
---
 scripts/audit_teaser_only.py | 131 ++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 scripts/audit_teaser_only.py

diff --git a/scripts/audit_teaser_only.py b/scripts/audit_teaser_only.py
new file mode 100644
index 0000000..26d2715
--- /dev/null
+++ b/scripts/audit_teaser_only.py
@@ -0,0 +1,131 @@
+"""Audit and remediate "teaser-only" scenes (data quality).
+
+Problem (bug report 2026-06-01, scene 48d6cc6b): the scene carries a canonical
+`scene.duration_sec` from TPDB (the real length of the studio scene, e.g.
+22 min), but its ONLY live playback_source is a short tube teaser/clip (e.g.
+xnxx 21 s). The app shows 22 min (the hero uses the canonical value) but plays
+21 s → the confusing "22m, yet the video is <1m".
+
+Fix: when ALL live sources of a scene are a negligible fraction of the
+canonical duration, the scene has no real playback. Those sources get
+`dead_at` set → the scene loses its live sources → `has_playback=false` →
+hidden (in line with the orphan policy: scenes that cannot really be watched
+are not shown).
+
+Conservative guards (they minimise false hides):
+- canonical (scene.duration_sec) must be > MIN_CANON (the real length is known)
+- ALL live sources must have a KNOWN duration (n_live == n_with_duration) —
+  a source without a duration could be the full scene, so the scene is skipped
+- even the LONGEST live source is < RATIO * canonical (so none of them is full)
+
+Reversible (dead_at, NOT delete). Dry-run by default; --yes writes.
+
+Usage:
+    python -m scripts.audit_teaser_only             # dry-run (summary)
+    python -m scripts.audit_teaser_only --list 30   # + list of scenes
+    python -m scripts.audit_teaser_only --yes       # apply
+"""
+from __future__ import annotations
+
+import argparse
+
+from sqlalchemy import text
+
+from app.db import engine
+
+MIN_CANON = 300  # canonical > 5 min — the real length of the full scene is known
+RATIO = 0.15  # even the longest live source < 15% of canonical = no full scene
+
+# Scenes whose every live source has a known duration below RATIO * canonical.
+_TEASER_SCENES_SQL = """
+WITH live AS (
+    SELECT scene_id,
+           count(*) AS n_live,
+           count(*) FILTER (WHERE duration_sec IS NOT NULL AND duration_sec > 0) AS n_dur,
+           max(duration_sec) AS max_dur
+    FROM playback_sources
+    WHERE dead_at IS NULL
+    GROUP BY scene_id
+)
+SELECT l.scene_id, sc.duration_sec AS canonical, l.n_live, l.max_dur, sc.title
+FROM live l
+JOIN scenes sc ON sc.id = l.scene_id
+WHERE sc.duration_sec > :min_canon
+  AND l.n_live = l.n_dur                    -- every live source has a known duration
+  AND l.max_dur < sc.duration_sec * :ratio  -- none of them is "full"
+ORDER BY l.scene_id
+"""
+
+# Per-row re-check of the teaser condition at write time, so a source that
+# appeared after the audit query and is not itself a teaser is never marked dead.
+_MARK_DEAD_SQL = """
+UPDATE playback_sources ps
+SET dead_at = now(),
+    dead_reason = :reason
+FROM scenes sc
+WHERE sc.id = ps.scene_id
+  AND sc.duration_sec > :min_canon
+  AND ps.dead_at IS NULL
+  AND ps.scene_id = ANY(:ids)
+  AND ps.duration_sec > 0
+  AND ps.duration_sec < sc.duration_sec * :ratio
+RETURNING ps.scene_id
+"""
+
+
+def main() -> None:
+    """Report teaser-only scenes and, with --yes, mark their live sources dead.
+
+    Always prints the number of matching scenes and of their live sources;
+    --list N additionally prints the first N matches. Without --yes nothing is
+    written. With --yes the matching live sources get dead_at/dead_reason set
+    and the change is committed.
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        # Keep the module docstring's paragraphs and usage examples as written.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("--list", type=int, default=0, metavar="N",
+                    help="also print the first N matching scenes")
+    ap.add_argument("--yes", action="store_true", help="wykonaj (bez tego dry-run)")
+    args = ap.parse_args()
+    if args.list < 0:
+        # A negative N would turn rows[:N] into "all but the last N".
+        ap.error("--list expects a non-negative integer")
+
+    pct = f"{RATIO:.0%}"  # the threshold as shown in the report and in dead_reason
+    params = {"min_canon": MIN_CANON, "ratio": RATIO}
+    with engine.connect() as conn:
+        rows = conn.execute(text(_TEASER_SCENES_SQL), params).fetchall()
+        # Every row already carries its scene's live-source count and title, so
+        # no follow-up queries are needed for the report.
+        n_sources = sum(r.n_live for r in rows)
+        print(f"teaser-only scenes (canonical>{MIN_CANON}s, "
+              f"all live sources <{pct} canonical): {len(rows)}")
+        print(f"live sources that would be marked dead: {n_sources}")
+
+        for r in rows[: args.list]:
+            print(f" {str(r.scene_id)[:8]} canonical={r.canonical}s n_live={r.n_live} "
+                  f"max_src={r.max_dur}s {(r.title or '')[:55]}")
+
+        if not args.yes:
+            print("\n(dry-run — uruchom z --yes aby zapisać)")
+            return
+        if not rows:
+            print("\nnothing to mark dead")
+            return
+
+        killed = conn.execute(text(_MARK_DEAD_SQL), {
+            **params,
+            "ids": [r.scene_id for r in rows],
+            "reason": f"teaser-only audit: all live sources <{pct} of canonical (orphan-hide)",
+        }).fetchall()
+        conn.commit()
+        n_hit_scenes = len({k.scene_id for k in killed})
+        print(f"\nAPPLIED: marked {len(killed)} sources dead across {n_hit_scenes} scenes "
+              f"(reversible: dead_at)")
+
+
+if __name__ == "__main__":
+    main()