diff --git a/scripts/dedup_n2_canonical.py b/scripts/dedup_n2_canonical.py new file mode 100644 index 0000000..a6a97f5 --- /dev/null +++ b/scripts/dedup_n2_canonical.py @@ -0,0 +1,107 @@ +"""False-merge remediacja dla scen z DOKŁADNIE 2 źródłami (n=2), rozstrzygana przez +canonical duration. + +Kontekst: audit_false_merges remediuje tylko n>=3 (większość źródeł = ground truth dla +outliera). Dla n=2 było "needs human review" — ale review-queue we froncie już nie ma. +Pomiar (2026-06-07): z 535 scen n=2-divergent (2 źródła, długości różnią się >3x), +WSZYSTKIE 535 mają canonical scene.duration_sec (z TPDB/StashDB), a 531 ma JEDNO źródło +pasujące do canonical (±20%) i drugie >2x odbiegające → jednoznaczny false-merge. + +Logika: canonical (TPDB/StashDB) = ground-truth tożsamości sceny. Źródło pasujące do +canonical (±KEEP_TOL) = keeper; drugie, >OUTLIER_DEV× odbiegające w którąkolwiek stronę += wrong video (teaser ALBO compilation) → dead. Działa w obie strony (w przeciwieństwie +do audit_false_merges które kasuje tylko krótsze), bo canonical jest tu SKORELOWANY z +keeperem (≠ Omar-case gdzie canonical nie pasował do niczego). Skip gdy oba pasują +(nie divergentne) lub żadne (canonical niewiarygodny — ambiguous). + +Reversible (dead_at). Domyślnie dry-run; --yes zapisuje. + + python -m scripts.dedup_n2_canonical # dry-run + python -m scripts.dedup_n2_canonical --list 30 + python -m scripts.dedup_n2_canonical --yes +""" +from __future__ import annotations + +import argparse + +from sqlalchemy import text + +from app.db import session_scope + +MIN_DUR = 30 +GAP = 90 +RATIO = 3.0 +KEEP_TOL = 0.20 # źródło w ±20% canonical = keeper (skorelowane → canonical wiarygodny) +OUTLIER_DEV = 2.0 # drugie źródło >2x od canonical (w którąkolwiek stronę) = wrong video + +_CANDIDATES_SQL = """ +WITH d AS ( + SELECT scene_id, count(*) n, min(duration_sec) mn, max(duration_sec) mx + FROM playback_sources + WHERE dead_at IS NULL AND duration_sec IS NOT NULL AND duration_sec > :min_dur + GROUP BY scene_id + HAVING count(*) = 2 + AND (max(duration_sec) - min(duration_sec)) > :gap + AND max(duration_sec)::float / min(duration_sec) >= :ratio +) +SELECT d.scene_id, sc.duration_sec AS canonical +FROM d JOIN scenes sc ON sc.id = d.scene_id +WHERE sc.duration_sec IS NOT NULL AND sc.duration_sec > :min_dur +""" + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--list", type=int, default=0) + ap.add_argument("--yes", action="store_true") + args = ap.parse_args() + params = {"min_dur": MIN_DUR, "gap": GAP, "ratio": RATIO} + + with session_scope() as s: + cands = list(s.execute(text(_CANDIDATES_SQL), params)) + + print(f"n=2 divergent scenes with canonical: {len(cands)}") + killed = touched = skipped_ambiguous = 0 + listed = 0 + for scene_id, canonical in cands: + with session_scope() as s: + srcs = list(s.execute(text(""" + SELECT id, duration_sec, origin FROM playback_sources + WHERE scene_id = :sid AND dead_at IS NULL + AND duration_sec IS NOT NULL AND duration_sec > 0 + ORDER BY duration_sec + """), {"sid": scene_id})) + if len(srcs) != 2: + continue + keepers = [r for r in srcs if abs(r[1] - canonical) <= canonical * KEEP_TOL] + def _is_outlier(dur: int) -> bool: + return canonical / dur > OUTLIER_DEV or dur / canonical > OUTLIER_DEV + outliers = [r for r in srcs if _is_outlier(r[1]) and r not in keepers] + # Bezpiecznie: dokładnie 1 keeper (skorelowany z canonical) + 1 outlier. + if len(keepers) != 1 or len(outliers) != 1: + skipped_ambiguous += 1 + continue + keep, kill = keepers[0], outliers[0] + touched += 1 + if listed < args.list: + listed += 1 + print(f" {str(scene_id)[:8]} canon={canonical}s keep={keep[2]}={keep[1]}s " + f"kill={kill[2]}={kill[1]}s") + if args.yes: + s.execute(text(""" + UPDATE playback_sources + SET dead_at = now(), + dead_reason = 'false-merge n=2 audit: duration off canonical (other source matches)' + WHERE id = :id + """), {"id": kill[0]}) + s.commit() + killed += 1 + + verb = f"killed={killed}" if args.yes else f"would_kill={touched}" + print(f"\n{'APPLIED' if args.yes else 'DRY-RUN'}: {verb} skipped_ambiguous={skipped_ambiguous}") + if not args.yes: + print("(dry-run — uruchom z --yes; reversible: dead_at)") + + +if __name__ == "__main__": + main()