Opt-in remediation for the duration-inconsistent scenes found by the audit. Scope is deliberately narrow and reversible: - only scenes with >=3 duration-bearing sources AND max/min ratio > 3x - anchored on scene.duration_sec (the canonical value), never the median of sources (a median is wrong when several bogus short clips outvote the real full-length source) - marks dead ONLY sources that are >2x SHORTER than the canonical — a falsely merged source is almost always a short SEO clip/preview. Sources longer than the canonical are left alone, since an over-long outlier more often means the canonical duration itself is too low (so killing the long source would drop the real video); those stay for manual review. - guards that at least one live source remains - dry-run by default; --yes to apply; sets dead_at (reversible), not delete First run marked 514 short-clip sources dead across 228 scenes. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
204 lines
9.9 KiB
Python
204 lines
9.9 KiB
Python
"""Audyt potencjalnych false-merge'y scen (data quality).
|
|
|
|
Kontekst: przed 2026-05-12 scoring auto-mergeował sceny w aggregator mode bez
|
|
wymaganego strong-signal (fp/duration/date). Hardening z 2026-05-12 (scoring.py:
|
|
`aggregator_weak_signal_cap` = 0.85 + duration bump zacieśniony do ≤3s, bug
|
|
ef090842) zatrzymuje NOWE false-merge'y, ale starsze zostały w danych.
|
|
|
|
Candidate-log (`merge_candidates`, status=auto_merged) NIE zapisuje który
|
|
external_ref dopięto, więc precyzyjne cofnięcie z samego logu jest niemożliwe.
|
|
Zamiast tego wykrywamy false-merge po SKUTKU: scena, do której dopięto źródło
|
|
innego wideo, ma `playback_sources` o **niespójnych długościach**. To sygnał
|
|
wykrywalny z bieżących danych i daje konkretny target (outlier source).
|
|
|
|
Domyślnie read-only — NIC nie mutuje (zapis tylko przy `--fix --yes`). Remediacja (detach/mark-dead outliera) to osobna
|
|
decyzja: krótka długość bywa legit (preview/trailer) albo błąd metadanych, więc
|
|
n=2 jest niejednoznaczne (nie wiadomo które źródło jest „prawdziwe”). Do oceny
|
|
ludzkiej / osobnego skryptu z jawnym progiem.
|
|
|
|
Uruchomienie:
|
|
python -m scripts.audit_false_merges # podsumowanie + buckety
|
|
python -m scripts.audit_false_merges --list 50 # 50 najgorszych z rozpiską
|
|
python -m scripts.audit_false_merges --min-ratio 3 # tylko ratio >= 3x
|
|
python -m scripts.audit_false_merges --json out.json # eksport suspektów
|
|
python -m scripts.audit_false_merges --fix # DRY-RUN remediacji (nic nie pisze)
|
|
python -m scripts.audit_false_merges --fix --yes # wykonaj remediację
|
|
|
|
Remediacja (--fix): tylko sceny n>=3 źródeł-z-duration ORAZ ratio>3x. Zakotwiczona
|
|
na `scene.duration_sec` (canonical = ground-truth, NIE median źródeł — bo przy
|
|
kilku błędnych krótkich klipach median wskazuje złe źródło, np. Omar galanti
|
|
[59s,60s,7921s] gdzie 7921 pasuje do tytułu). Oznacza `dead_at` (reversible, NIE
|
|
delete) tylko na źródłach wyraźnie KRÓTSZYCH od canonical (canonical/źródło > 2x); dłuższe zostają do ręcznego review.
|
|
Pomija sceny bez canonical duration i gwarantuje że >=1 źródło zostaje żywe.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
|
|
from sqlalchemy import text
|
|
|
|
from app.db import engine
|
|
|
|
# Suspect: a scene with >=2 live sources that carry a duration, where the longest and
# the shortest differ by more than MIN_ABS_GAP seconds AND the max/min ratio is at
# least the requested minimum ratio.
MIN_ABS_GAP = 90 # seconds — filters out small encoding/trim differences
DEFAULT_MIN_RATIO = 1.25
MIN_DUR = 30 # sources at or below this many seconds are ignored when computing the spread
|
|
|
|
_SUSPECTS_SQL = """
|
|
WITH d AS (
|
|
SELECT scene_id, duration_sec
|
|
FROM playback_sources
|
|
WHERE dead_at IS NULL AND duration_sec IS NOT NULL AND duration_sec > :min_dur
|
|
),
|
|
agg AS (
|
|
SELECT scene_id, count(*) AS n, min(duration_sec) AS mn, max(duration_sec) AS mx
|
|
FROM d GROUP BY scene_id HAVING count(*) >= 2
|
|
)
|
|
SELECT a.scene_id, a.n, a.mn, a.mx, (a.mx::float / NULLIF(a.mn, 0)) AS ratio
|
|
FROM agg a
|
|
WHERE (a.mx - a.mn) > :gap AND (a.mx::float / NULLIF(a.mn, 0)) >= :ratio
|
|
ORDER BY ratio DESC
|
|
"""
|
|
|
|
|
|
def _summary(conn) -> None:
    """Print duration coverage of live sources and suspect counts per severity bucket.

    First reports how many live playback sources have a positive duration, then the
    number of duration-inconsistent scenes at the default thresholds, and finally
    that number split into max/min ratio buckets. Everything goes to stdout.
    """
    with_duration, live_total = conn.execute(text("""
        SELECT count(*) FILTER (WHERE duration_sec IS NOT NULL AND duration_sec > 0), count(*)
        FROM playback_sources WHERE dead_at IS NULL
    """)).fetchone()
    # max(..., 1) keeps the percentage defined when there are no live sources at all.
    coverage_pct = 100 * with_duration / max(live_total, 1)
    print(f"playback_sources with duration: {with_duration}/{live_total} "
          f"({coverage_pct:.0f}%)")

    # Thresholds shared by the total and by every bucket query.
    base_params = {"min_dur": MIN_DUR, "gap": MIN_ABS_GAP, "ratio": DEFAULT_MIN_RATIO}

    suspect_total = conn.execute(
        text(f"SELECT count(*) FROM ({_SUSPECTS_SQL}) q"), base_params
    ).scalar()
    print(f"\nduration-inconsistent scenes (gap>{MIN_ABS_GAP}s, ratio>={DEFAULT_MIN_RATIO}x): {suspect_total}")

    print("severity buckets (by max/min ratio):")
    bucket_query = text(f"SELECT count(*) FROM ({_SUSPECTS_SQL}) q WHERE ratio >= :lo AND ratio < :hi")
    # Half-open [lower, upper) ranges; the last upper bound is effectively "no limit".
    buckets = (
        ("1.25-1.5x", 1.25, 1.5),
        ("1.5-2x", 1.5, 2.0),
        ("2-3x", 2.0, 3.0),
        (">3x", 3.0, 1e9),
    )
    for label, lower, upper in buckets:
        in_bucket = conn.execute(
            bucket_query, {**base_params, "lo": lower, "hi": upper}
        ).scalar()
        print(f" {label:>10}: {in_bucket}")

    print("\nNOTE: read-only audit. >3x ratio = strong false-merge signal "
          "(short clip attached to full scene). n>=3 disambiguates the outlier; "
          "n=2 needs human review. Remediation is a separate, explicit decision.")
|
|
|
|
|
|
def _rows(conn, min_ratio: float):
    """Fetch every suspect scene whose max/min duration ratio is at least ``min_ratio``.

    Rows are (scene_id, n, mn, mx, ratio) as produced by ``_SUSPECTS_SQL``, worst
    ratio first; the gap and minimum-duration thresholds come from module constants.
    """
    query_params = {"ratio": min_ratio, "gap": MIN_ABS_GAP, "min_dur": MIN_DUR}
    suspects = conn.execute(text(_SUSPECTS_SQL), query_params)
    return suspects.fetchall()
|
|
|
|
|
|
def _source_breakdown(conn, scene_id):
    """Return all playback sources of one scene, dead ones included, shortest first.

    Each row is (origin, duration_sec, dead flag, page_url); sources without a
    duration are sorted to the end.
    """
    breakdown_query = text("""
        SELECT origin, duration_sec, dead_at IS NOT NULL AS dead, page_url
        FROM playback_sources WHERE scene_id = :sid
        ORDER BY duration_sec NULLS LAST
    """)
    result = conn.execute(breakdown_query, {"sid": scene_id})
    return result.fetchall()
|
|
|
|
|
|
# Remediation: only scenes whose signal is strong enough that the risk of a false kill is minimal.
FIX_MIN_N = 3 # minimum number of duration-bearing sources
FIX_MIN_RATIO = 3.0 # minimum max/min ratio of the scene
FIX_OUTLIER_DEV = 2.0 # a SHORT source is an outlier when anchor/dur > 2 (shorter only!)
# Sources LONGER than canonical are deliberately NOT killed: a wrong attachment is almost
# always a short SEO clip/preview. When a source is >2x LONGER than canonical, it is more
# often the canonical scene.duration_sec that is too low (itself set from a bad merge),
# and the long source is the real full scene — killing it would be a false kill. Those
# stay for manual review (they are visible in --list).
|
|
|
|
|
|
def _remediate(conn, *, apply: bool) -> None:
    """Mark clearly-too-short playback sources as dead on strongly inconsistent scenes.

    Candidate scenes come from ``_SUSPECTS_SQL`` evaluated at ``FIX_MIN_RATIO`` and
    must have at least ``FIX_MIN_N`` live sources counted by that query. Each scene
    is anchored on ``scenes.duration_sec``; a live source is an outlier only when
    ``anchor / duration_sec > FIX_OUTLIER_DEV``, i.e. when it is much SHORTER than
    the anchor. Sources longer than the anchor are never touched.

    Args:
        conn: open database connection; used for the reads, the UPDATE and the commit.
        apply: False (dry-run) prints the plan only; True sets ``dead_at`` and
            ``dead_reason`` on the outliers and commits.

    Returns:
        None. The per-scene plan and a one-line summary are printed to stdout.
    """
    rows = conn.execute(
        text(_SUSPECTS_SQL),
        {"min_dur": MIN_DUR, "gap": MIN_ABS_GAP, "ratio": FIX_MIN_RATIO},
    ).fetchall()
    scanned = killed = would_kill = skipped_no_anchor = skipped_guard = touched_scenes = 0
    for r in rows:
        scene_id, n = r[0], r[1]
        if n < FIX_MIN_N:
            continue
        scanned += 1

        anchor = conn.execute(text("SELECT duration_sec FROM scenes WHERE id=:i"),
                              {"i": scene_id}).scalar()
        # A missing or junk-short canonical duration cannot serve as ground truth.
        if not anchor or anchor <= MIN_DUR:
            skipped_no_anchor += 1
            continue

        # NOTE: this selects every live source with duration > 0, whereas the
        # eligibility query above only counts sources longer than MIN_DUR.
        srcs = conn.execute(text("""
            SELECT id, duration_sec, origin FROM playback_sources
            WHERE scene_id=:sid AND dead_at IS NULL AND duration_sec IS NOT NULL AND duration_sec>0
        """), {"sid": scene_id}).fetchall()

        # Single-pass split: only SHORT outliers (anchor/dur > FIX_OUTLIER_DEV) are
        # kill candidates; everything else, including over-long sources, is kept.
        outliers, keepers = [], []
        for s in srcs:
            (outliers if anchor / s[1] > FIX_OUTLIER_DEV else keepers).append(s)

        # Guard: nothing to do without outliers, and never kill every
        # duration-bearing source of a scene. Both cases count as skipped_guard.
        if not outliers or not keepers:
            skipped_guard += 1
            continue

        touched_scenes += 1
        would_kill += len(outliers)
        print(f" scene {str(scene_id)[:8]} anchor={anchor}s keep={len(keepers)} "
              f"kill={len(outliers)}: " + ", ".join(f"{o[2]}={o[1]}s" for o in outliers))

        if apply:
            ids = [o[0] for o in outliers]
            # `dead_at IS NULL` keeps the UPDATE from overwriting the timestamp and
            # reason of a source that was marked dead after the SELECT above ran.
            result = conn.execute(text("""
                UPDATE playback_sources
                SET dead_at=now(),
                    dead_reason='false-merge audit: duration outlier vs canonical'
                WHERE id = ANY(:ids) AND dead_at IS NULL
            """), {"ids": ids})
            # Count the rows actually changed, not the rows we intended to change.
            killed += result.rowcount

    if apply:
        conn.commit()

    verb = f"sources_marked_dead={killed}" if apply else f"sources_would_mark_dead={would_kill}"
    print(f"\n{'APPLIED' if apply else 'DRY-RUN'}: scenes_eligible={scanned} "
          f"touched={touched_scenes} {verb} "
          f"skipped_no_anchor={skipped_no_anchor} skipped_guard={skipped_guard}")
    if not apply:
        print("(dry-run — uruchom z --yes aby zapisać; reversible: dead_at IS NOT NULL)")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: print the summary, then run the requested actions.

    The summary is always printed. ``--list N`` prints the N worst suspects with a
    per-source breakdown, ``--json PATH`` exports all suspects at the chosen ratio,
    and ``--fix`` runs the remediation (dry-run unless ``--yes`` is also given).
    """
    # RawDescriptionHelpFormatter keeps the module docstring's line breaks, so the
    # usage examples in it stay one command per line in --help.
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--list", type=int, default=0, metavar="N",
                    help="wypisz N najgorszych scen z rozpiską źródeł")
    ap.add_argument("--min-ratio", type=float, default=DEFAULT_MIN_RATIO,
                    help=f"minimalny max/min ratio (default {DEFAULT_MIN_RATIO})")
    ap.add_argument("--json", metavar="PATH", help="eksportuj wszystkich suspektów do JSON")
    ap.add_argument("--fix", action="store_true",
                    help="remediacja: mark-dead outlier sources (n>=3, ratio>3x, anchored on canonical)")
    ap.add_argument("--yes", action="store_true", help="faktycznie zapisz (bez tego --fix = dry-run)")
    args = ap.parse_args()

    with engine.connect() as conn:
        _summary(conn)

        # Fetch the suspects once and share them between --list and --json.
        suspects = _rows(conn, args.min_ratio) if (args.list or args.json) else []

        if args.list:
            rows = suspects[: args.list]
            print(f"\n=== top {len(rows)} suspects (ratio>={args.min_ratio}x) ===")
            for r in rows:
                title = conn.execute(text("SELECT title FROM scenes WHERE id=:i"),
                                     {"i": r[0]}).scalar()
                print(f"\n{r[0]} n={r[1]} {r[2]}s..{r[3]}s ratio={r[4]:.1f}x {(title or '')[:60]}")
                for b in _source_breakdown(conn, r[0]):
                    flag = " DEAD" if b[2] else ""
                    print(f" {b[0]:<28} {str(b[1]) + 's':>8}{flag} {(b[3] or '')[:50]}")

        if args.json:
            out = [{"scene_id": str(r[0]), "n_sources": r[1], "min_dur": r[2],
                    "max_dur": r[3], "ratio": round(r[4], 3)} for r in suspects]
            with open(args.json, "w", encoding="utf-8") as f:
                json.dump(out, f, indent=2)
            print(f"\nwrote {len(out)} suspects -> {args.json}")

        if args.fix:
            print(f"\n=== remediacja (n>={FIX_MIN_N}, ratio>{FIX_MIN_RATIO}x, "
                  f"outlier>{FIX_OUTLIER_DEV}x od canonical) ===")
            # Without --yes this is a dry-run: nothing is written.
            _remediate(conn, apply=args.yes)
|
|
|
|
|
|
# Run the CLI only when executed as a script / via `python -m`, not on import.
if __name__ == "__main__":
    main()
|