feat(scripts): dedup_n2_canonical — resolve n=2 false-merges via canonical duration
audit_false_merges only auto-fixes n>=3 (majority disambiguates the outlier); n=2 was "needs human review" — but the merge-review UI is gone, nobody triages 500+. Measured: of 535 n=2 duration-divergent scenes, ALL have a canonical scene.duration_sec (TPDB/StashDB) and 531 have exactly one source matching canonical (±20%) + one >2x off → unambiguous false-merge. Kill the off source (works both directions since canonical is corroborated by the matching keeper, unlike the Omar-case the n>=3 audit guards against). Applied: 529 sources marked dead (4 ambiguous skipped). Reversible (dead_at). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
4922646011
commit
9f46e8dea9
1 changed files with 107 additions and 0 deletions
107
scripts/dedup_n2_canonical.py
Normal file
107
scripts/dedup_n2_canonical.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
"""False-merge remediacja dla scen z DOKŁADNIE 2 źródłami (n=2), rozstrzygana przez
|
||||
canonical duration.
|
||||
|
||||
Kontekst: audit_false_merges remediuje tylko n>=3 (większość źródeł = ground truth dla
|
||||
outliera). Dla n=2 było "needs human review" — ale review-queue we froncie już nie ma.
|
||||
Pomiar (2026-06-07): z 535 scen n=2-divergent (2 źródła, długości różnią się >3x),
|
||||
WSZYSTKIE 535 mają canonical scene.duration_sec (z TPDB/StashDB), a 531 ma JEDNO źródło
|
||||
pasujące do canonical (±20%) i drugie >2x odbiegające → jednoznaczny false-merge.
|
||||
|
||||
Logika: canonical (TPDB/StashDB) = ground-truth tożsamości sceny. Źródło pasujące do
|
||||
canonical (±KEEP_TOL) = keeper; drugie, >OUTLIER_DEV× odbiegające w którąkolwiek stronę
|
||||
= wrong video (teaser ALBO compilation) → dead. Działa w obie strony (w przeciwieństwie
|
||||
do audit_false_merges które kasuje tylko krótsze), bo canonical jest tu SKORELOWANY z
|
||||
keeperem (≠ Omar-case gdzie canonical nie pasował do niczego). Skip gdy oba pasują
|
||||
(nie divergentne) lub żadne (canonical niewiarygodny — ambiguous).
|
||||
|
||||
Reversible (dead_at). Domyślnie dry-run; --yes zapisuje.
|
||||
|
||||
python -m scripts.dedup_n2_canonical # dry-run
|
||||
python -m scripts.dedup_n2_canonical --list 30
|
||||
python -m scripts.dedup_n2_canonical --yes
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.db import session_scope
|
||||
|
||||
# Candidate-selection thresholds; main() binds them into _CANDIDATES_SQL as
# :min_dur, :gap and :ratio.
MIN_DUR = 30  # source and canonical durations must be greater than this (duration_sec)
GAP = 90  # max - min of the two source durations must exceed this
RATIO = 3.0  # max / min of the two source durations must be at least this

# Classification thresholds, both relative to the scene's canonical duration.
KEEP_TOL = 0.20  # a source within +/-20% of canonical is the keeper (it corroborates canonical)
OUTLIER_DEV = 2.0  # a source more than 2x off canonical, in either direction, is the wrong video
|
||||
|
||||
# Selects (scene_id, canonical) for every scene that has exactly two live
# playback sources longer than :min_dur whose durations diverge (difference
# greater than :gap AND max/min ratio of at least :ratio), and whose own
# scenes.duration_sec is present and also greater than :min_dur.
_CANDIDATES_SQL = """
    WITH d AS (
        SELECT scene_id, count(*) n, min(duration_sec) mn, max(duration_sec) mx
        FROM playback_sources
        WHERE dead_at IS NULL AND duration_sec IS NOT NULL AND duration_sec > :min_dur
        GROUP BY scene_id
        HAVING count(*) = 2
           AND (max(duration_sec) - min(duration_sec)) > :gap
           AND max(duration_sec)::float / min(duration_sec) >= :ratio
    )
    SELECT d.scene_id, sc.duration_sec AS canonical
    FROM d JOIN scenes sc ON sc.id = d.scene_id
    WHERE sc.duration_sec IS NOT NULL AND sc.duration_sec > :min_dur
"""
|
||||
|
||||
|
||||
# Live sources of one scene that carry a positive duration, shortest first.
_LIVE_SOURCES_SQL = """
    SELECT id, duration_sec, origin FROM playback_sources
    WHERE scene_id = :sid AND dead_at IS NULL
      AND duration_sec IS NOT NULL AND duration_sec > 0
    ORDER BY duration_sec
"""

# Soft-delete one source. The dead_at IS NULL guard keeps an already-dead row's
# original dead_at / dead_reason intact if it was killed in the meantime.
_MARK_DEAD_SQL = """
    UPDATE playback_sources
    SET dead_at = now(),
        dead_reason = 'false-merge n=2 audit: duration off canonical (other source matches)'
    WHERE id = :id AND dead_at IS NULL
"""


def _pick_keeper_and_outlier(rows, canonical, keep_tol, outlier_dev):
    """Split source rows into the single keeper and the single outlier.

    Args:
        rows: sequence of row-likes whose index 1 is the source duration.
        canonical: the scene's canonical duration.
        keep_tol: relative tolerance; a row within ``canonical * keep_tol`` of
            canonical is a keeper.
        outlier_dev: a non-keeper row whose duration is more than this factor
            away from canonical (longer or shorter) is an outlier.

    Returns:
        ``(keeper_row, outlier_row)`` when there is exactly one of each,
        otherwise ``None`` (ambiguous: both match, neither matches, or the
        non-matching row is not far enough off).
    """
    canon = float(canonical)  # float() so a Decimal canonical mixes with float thresholds
    if canon <= 0:
        return None
    keepers, outliers = [], []
    for row in rows:
        dur = float(row[1])
        if dur <= 0:
            # A non-positive duration cannot be classified (and would divide by zero).
            return None
        if abs(dur - canon) <= canon * keep_tol:
            keepers.append(row)
        elif canon / dur > outlier_dev or dur / canon > outlier_dev:
            outliers.append(row)
    if len(keepers) != 1 or len(outliers) != 1:
        return None
    return keepers[0], outliers[0]


def main() -> None:
    """Mark the off-canonical source of each n=2 duration-divergent scene as dead.

    Dry-run by default: only counts (and, with ``--list N``, prints the first
    N decisions). With ``--yes`` the outlier source of every unambiguous scene
    gets ``dead_at`` / ``dead_reason`` set, one commit per scene.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks (usage examples) in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--list", type=int, default=0,
                    help="print keeper/outlier details for the first N remediable scenes")
    ap.add_argument("--yes", action="store_true",
                    help="apply the changes (default: dry-run)")
    args = ap.parse_args()
    params = {"min_dur": MIN_DUR, "gap": GAP, "ratio": RATIO}

    with session_scope() as s:
        cands = list(s.execute(text(_CANDIDATES_SQL), params))

    print(f"n=2 divergent scenes with canonical: {len(cands)}")
    killed = touched = skipped_ambiguous = skipped_not_pair = 0
    listed = 0
    for scene_id, canonical in cands:
        # One session per scene: each kill is committed independently.
        with session_scope() as s:
            srcs = list(s.execute(text(_LIVE_SOURCES_SQL), {"sid": scene_id}))
            if len(srcs) != 2:
                # The candidate query only counts sources longer than MIN_DUR;
                # this re-read uses > 0, so extra short sources (or a change
                # since the candidate query ran) land here. Count, don't hide.
                skipped_not_pair += 1
                continue
            verdict = _pick_keeper_and_outlier(srcs, canonical, KEEP_TOL, OUTLIER_DEV)
            # Safe only with exactly 1 keeper (corroborating canonical) + 1 outlier.
            if verdict is None:
                skipped_ambiguous += 1
                continue
            keep, kill = verdict
            touched += 1
            if listed < args.list:
                listed += 1
                print(f" {str(scene_id)[:8]} canon={canonical}s keep={keep[2]}={keep[1]}s "
                      f"kill={kill[2]}={kill[1]}s")
            if args.yes:
                result = s.execute(text(_MARK_DEAD_SQL), {"id": kill[0]})
                s.commit()
                # rowcount == 0 means the guard matched nothing (already dead).
                if result.rowcount != 0:
                    killed += 1

    verb = f"killed={killed}" if args.yes else f"would_kill={touched}"
    print(f"\n{'APPLIED' if args.yes else 'DRY-RUN'}: {verb} skipped_ambiguous={skipped_ambiguous}")
    if skipped_not_pair:
        print(f"skipped_not_pair={skipped_not_pair} (live source set is no longer exactly 2)")
    if not args.yes:
        print("(dry-run — uruchom z --yes; reversible: dead_at)")
|
||||
|
||||
|
||||
# Run the remediation only when executed as a script / `python -m`, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue