feat(scripts): audit_teaser_only — hide scenes whose only source is a teaser
bug-report 2026-06-01 (48d6cc6b): scene shows canonical duration from TPDB (real 22min studio scene) but the only live playback_source is a short tube teaser (xnxx 21s) → "shows 22m, plays <1m". When ALL live sources are a tiny fraction (<15%) of a known canonical (>300s), the scene has no real playback; mark those sources dead → scene becomes orphan → hidden (has_playback=false), consistent with the orphan-hiding policy. Reversible (dead_at), conservative (skips scenes with any unknown-duration or full-length live source). Applied on prod: 182 sources dead across 174 scenes. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
63880feeb1
commit
d5409d01ce
1 changed files with 95 additions and 0 deletions
95
scripts/audit_teaser_only.py
Normal file
95
scripts/audit_teaser_only.py
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
"""Audyt + remediacja scen "teaser-only" (data quality).
|
||||||
|
|
||||||
|
Problem (bug-report 2026-06-01, scena 48d6cc6b): scena ma canonical
|
||||||
|
`scene.duration_sec` z TPDB (prawdziwa długość studyjnej sceny, np. 22min), ale
|
||||||
|
JEDYNE żywe playback_source to krótki tubowy teaser/klip (np. xnxx 21s). Apka
|
||||||
|
pokazuje 22min (hero z canonical) a gra 21s → mylące "22m a wideo <1m".
|
||||||
|
|
||||||
|
Fix: gdy WSZYSTKIE żywe źródła sceny są znikomą częścią canonical, scena nie ma
|
||||||
|
realnego playbacku — oznaczamy te źródła `dead_at` → scena traci żywe źródła →
|
||||||
|
`has_playback=false` → ukryta (zgodnie z polityką orphanów: nie pokazujemy scen
|
||||||
|
których nie da się realnie obejrzeć).
|
||||||
|
|
||||||
|
Konserwatywne guardy (minimalizują fałszywe ukrycia):
|
||||||
|
- canonical (scene.duration_sec) musi być > MIN_CANON (znamy prawdziwą długość)
|
||||||
|
- WSZYSTKIE żywe źródła muszą mieć ZNANĄ duration (n_live == n_with_duration) —
|
||||||
|
źródło bez duration mogłoby być pełną sceną, więc wtedy pomijamy
|
||||||
|
- nawet NAJDŁUŻSZE żywe źródło < RATIO * canonical (czyli żaden nie jest pełny)
|
||||||
|
|
||||||
|
Reversible (dead_at, NIE delete). Domyślnie dry-run; --yes zapisuje.
|
||||||
|
|
||||||
|
Uruchomienie:
|
||||||
|
python -m scripts.audit_teaser_only # dry-run (podsumowanie)
|
||||||
|
python -m scripts.audit_teaser_only --list 30 # + lista scen
|
||||||
|
python -m scripts.audit_teaser_only --yes # wykonaj
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.db import engine
|
||||||
|
|
||||||
|
# Canonical duration threshold in seconds (5 min): only scenes whose
# scene.duration_sec is strictly greater than this are audited, i.e. scenes
# for which the real full-scene length is considered known.
MIN_CANON = 300
# Fraction of the canonical duration: a scene matches when even its longest
# live source is shorter than 15% of canonical, i.e. no live source is the
# full scene.
RATIO = 0.15

# Query returning one row per "teaser-only" scene:
#   (scene_id, canonical, n_live, max_dur)
# The `live` CTE aggregates, per scene, the playback sources with
# dead_at IS NULL: how many there are, how many have a positive non-NULL
# duration, and the longest duration. The outer query keeps scenes where
# canonical > :min_canon, every live source has a known duration
# (n_live = n_dur), and the longest live source is below canonical * :ratio.
# Bind parameters: :min_canon, :ratio.
_TEASER_SCENES_SQL = """
WITH live AS (
SELECT scene_id,
count(*) AS n_live,
count(*) FILTER (WHERE duration_sec IS NOT NULL AND duration_sec > 0) AS n_dur,
max(duration_sec) AS max_dur
FROM playback_sources
WHERE dead_at IS NULL
GROUP BY scene_id
)
SELECT l.scene_id, sc.duration_sec AS canonical, l.n_live, l.max_dur
FROM live l
JOIN scenes sc ON sc.id = l.scene_id
WHERE sc.duration_sec > :min_canon
AND l.n_live = l.n_dur -- wszystkie żywe źródła mają znaną duration
AND l.max_dur < sc.duration_sec * :ratio -- żaden nie jest "pełny"
"""
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Audit "teaser-only" scenes and optionally mark their live sources dead.

    Runs ``_TEASER_SCENES_SQL`` with ``MIN_CANON`` / ``RATIO`` and prints how
    many scenes matched and how many live playback sources they have.

    Command-line flags:
        --list N  Also print the first N matching scenes (id prefix,
                  canonical duration, live-source count, longest live
                  source, title). Values <= 0 print nothing.
        --yes     Set ``dead_at`` / ``dead_reason`` on every live source of
                  the matched scenes and commit. Without it nothing is
                  written (dry-run).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks (the usage examples) intact
        # in --help instead of letting argparse re-wrap them into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "--list", type=int, default=0, metavar="N",
        help="also print the first N matching scenes",
    )
    ap.add_argument("--yes", action="store_true", help="wykonaj (bez tego dry-run)")
    args = ap.parse_args()

    # round(), not int(): products such as 0.29 * 100 land just below the
    # integer in binary floating point and int() would truncate them to 28.
    pct = round(RATIO * 100)
    params = {"min_canon": MIN_CANON, "ratio": RATIO}

    with engine.connect() as conn:
        rows = conn.execute(text(_TEASER_SCENES_SQL), params).fetchall()
        n_scenes = len(rows)
        scene_ids = [r[0] for r in rows]

        n_sources = 0
        if scene_ids:
            n_sources = conn.execute(
                text(
                    "SELECT count(*) FROM playback_sources "
                    "WHERE dead_at IS NULL AND scene_id = ANY(:ids)"
                ),
                {"ids": scene_ids},
            ).scalar()

        print(f"teaser-only scenes (canonical>{MIN_CANON}s, all live sources <{pct}% canonical): {n_scenes}")
        print(f"live sources that would be marked dead: {n_sources}")

        # "> 0" so that a negative N does not silently slice rows[:-N].
        if args.list > 0:
            listed = rows[: args.list]
            titles = {}
            if listed:
                # One query for all listed titles instead of one per scene.
                title_rows = conn.execute(
                    text("SELECT id, title FROM scenes WHERE id = ANY(:ids)"),
                    {"ids": [r[0] for r in listed]},
                )
                # Keyed by str() so the lookup does not depend on the driver
                # returning the same Python type for scenes.id and scene_id.
                titles = {str(sid): title for sid, title in title_rows}
            for r in listed:
                title = titles.get(str(r[0]))
                print(f"  {str(r[0])[:8]}  canonical={r[1]}s  n_live={r[2]}  max_src={r[3]}s  {(title or '')[:55]}")

        if args.yes and scene_ids:
            # NOTE(review): the UPDATE re-selects live sources by scene_id, so a
            # source added between the audit query above and this statement
            # would presumably be marked dead as well — confirm that window is
            # acceptable for how this script is run.
            res = conn.execute(
                text("""
                    UPDATE playback_sources
                    SET dead_at = now(),
                        dead_reason = :reason
                    WHERE dead_at IS NULL AND scene_id = ANY(:ids)
                """),
                {
                    "ids": scene_ids,
                    # Derived from RATIO so the stored reason cannot drift from
                    # the threshold actually applied.
                    "reason": f"teaser-only audit: all live sources <{pct}% of canonical (orphan-hide)",
                },
            )
            conn.commit()
            print(f"\nAPPLIED: marked {res.rowcount} sources dead across {n_scenes} scenes (reversible: dead_at)")
        elif args.yes:
            # --yes was given but the audit matched nothing; say so explicitly.
            print("\nnothing to apply: no teaser-only scenes found")
        else:
            print("\n(dry-run — uruchom z --yes aby zapisać)")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the audit only when the module is executed as a script
# (e.g. `python -m scripts.audit_teaser_only`), not when it is imported.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Reference in a new issue