From e23e2d1f17a7d31fa4d1817182f86a304e22cc73 Mon Sep 17 00:00:00 2001
From: jtrzupek
Date: Mon, 8 Jun 2026 10:56:50 +0200
Subject: [PATCH] fix(merge): move playback_sources on scene merge + exact-title+duration dedup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

merge_scenes never reassigned playback_sources → ON DELETE CASCADE dropped them
with the absorbed scene. Cross-source (canonical) merges rarely had tube playback
so it hid, but tube-dup merges silently LOST playback links. Add
_move_playback_sources (global unique (origin,page_url) guarantees no collision
on reassign).

+ merge_exact_title_duration.py: catches missing-merge dupes bulk_dedup misses
(same performer + identical normalized title + identical duration_sec, no phash).
Bad Bella had 25 such pairs (bug-report ef92809d "duplikat, te same miniatury").

Co-Authored-By: Claude Opus 4.8
---
 app/resolve/scene_merge.py            | 18 +++++
 scripts/merge_exact_title_duration.py | 98 +++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 scripts/merge_exact_title_duration.py

diff --git a/app/resolve/scene_merge.py b/app/resolve/scene_merge.py
index 59bcc92..6af58f9 100644
--- a/app/resolve/scene_merge.py
+++ b/app/resolve/scene_merge.py
@@ -21,6 +21,7 @@ from sqlalchemy import or_, select, update
 from sqlalchemy.orm import Session
 
 from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
+from app.models.playback_source import PlaybackSource
 from app.models.scene import (
     Scene,
     SceneExternalRef,
@@ -55,6 +56,7 @@ def merge_scenes(
     _move_performers(session, keep_id=keep_id, drop_id=drop_id)
     _move_tags(session, keep_id=keep_id, drop_id=drop_id)
     _move_fingerprints(session, keep_id=keep_id, drop_id=drop_id)
+    _move_playback_sources(session, keep_id=keep_id, drop_id=drop_id)
     _coalesce_canonical_fields(keep, drop)
 
     session.delete(drop)
@@ -186,6 +188,22 @@ def _move_fingerprints(session: Session, *, keep_id: uuid.UUID, drop_id: uuid.UU
         fp.scene_id = keep_id
 
 
+def _move_playback_sources(session: Session, *, keep_id: uuid.UUID, drop_id: uuid.UUID) -> None:
+    """Reassign playback_sources from `drop` to `keep`.
+
+    Previously merge_scenes did NOT touch playback_sources → CASCADE (ON DELETE) deleted
+    them together with the `drop` scene. For cross-source merges (tpdb↔stashdb canonical,
+    usually without tube playback) that did not hurt, but phash_exact / tube-dup merges LOST
+    their playback links. The unique constraint `(origin, page_url)` is GLOBAL → drop and
+    keep cannot share the same source, so simply repointing scene_id cannot collide.
+    """
+    session.execute(
+        update(PlaybackSource)
+        .where(PlaybackSource.scene_id == drop_id)
+        .values(scene_id=keep_id)
+    )
+
+
 def _coalesce_canonical_fields(keep: Scene, drop: Scene) -> None:
     """Wypełnij braki w `keep` polami z `drop`. Nie nadpisuje istniejących wartości."""
     if not keep.description and drop.description:
diff --git a/scripts/merge_exact_title_duration.py b/scripts/merge_exact_title_duration.py
new file mode 100644
index 0000000..c995b61
--- /dev/null
+++ b/scripts/merge_exact_title_duration.py
@@ -0,0 +1,98 @@
+"""Merge missed duplicates: same performer + identical normalized title
++ identical duration (to the second).
+
+Context: bulk_dedup catches cross-source (tpdb↔stashdb) and exact-phash dupes, but NOT
+tube dupes without fingerprints (e.g. the same scene scraped 2× under a different URL/slug).
+On the performer page the user then sees "same thumbnails, duplicate" (bug report
+ef92809d — Bad Bella had 25 such pairs). The signal `same performer + exact norm-title
++ exact duration_sec` is practically certain (two different videos do not share a
+byte-identical title AND a duration identical to the second).
+
+Keep = the scene with the most external_refs → then most playback_sources → then the oldest.
+Merging goes through resolve.scene_merge.merge_scenes (moves refs/performers/tags/fingerprints/
+playback_sources — the playback move was added 2026-06-08 together with this script).
+
+Usage (worker container):
+    python scripts/merge_exact_title_duration.py [PERFORMER_ID] [--commit]
+Without PERFORMER_ID = all performers (global). Without --commit = dry-run.
+"""
+from __future__ import annotations
+
+import sys
+
+from sqlalchemy import text
+
+from app.db import session_scope
+from app.resolve.scene_merge import merge_scenes
+
+
+def _args() -> tuple[str | None, bool]:
+    commit = "--commit" in sys.argv
+    pid = None
+    for a in sys.argv[1:]:
+        if a != "--commit" and len(a) >= 32:  # any other arg of 32+ chars is taken as the performer id
+            pid = a  # no early exit: the last matching arg wins
+    return pid, commit
+
+
+def _groups(pid: str | None) -> list[list[str]]:
+    # Groups of scenes (per performer) with identical lower(trim(title)) + duration_sec.
+    # member order: refs DESC, srcs DESC, created_at ASC → first = keeper.
+    where_perf = "AND sp.performer_id = :pid" if pid else ""  # fixed clause; pid itself is bound via params
+    sql = f"""
+    WITH cand AS (
+      SELECT s.id,
+             sp.performer_id,
+             lower(btrim(s.title)) nt,
+             s.duration_sec dur,
+             s.created_at,
+             (SELECT count(*) FROM scene_external_refs r WHERE r.scene_id=s.id) refs,
+             (SELECT count(*) FROM playback_sources p WHERE p.scene_id=s.id) srcs
+      FROM scenes s
+      JOIN scene_performers sp ON sp.scene_id=s.id {where_perf}
+      WHERE s.duration_sec IS NOT NULL AND btrim(s.title) <> ''
+    )
+    SELECT array_agg(id::text ORDER BY refs DESC, srcs DESC, created_at ASC) members
+    FROM cand
+    GROUP BY performer_id, nt, dur
+    HAVING count(*) > 1
+    """
+    params = {"pid": pid} if pid else {}
+    with session_scope() as s:
+        rows = s.execute(text(sql), params).all()
+    # dedupe groups (the same set can come out for 2 performers who share scenes)
+    seen: set[frozenset] = set()
+    out: list[list[str]] = []
+    for (members,) in rows:
+        key = frozenset(members)  # order-insensitive identity of the group
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(list(members))
+    return out
+
+
+def main() -> None:
+    pid, commit = _args()
+    groups = _groups(pid)
+    pairs = sum(len(g) - 1 for g in groups)  # every non-keeper member is one merge
+    print(f"performer={pid or 'ALL'} groups={len(groups)} merges={pairs} commit={commit}", flush=True)
+    merged = 0
+    for g in groups:
+        keep = g[0]  # members arrive keeper-first (ordering is done in the SQL)
+        for drop in g[1:]:
+            if not commit:
+                print(f" [dry] keep {keep[:8]} <- drop {drop[:8]}")
+                continue
+            try:
+                with session_scope() as s:  # one session_scope per pair
+                    import uuid as _u
+                    merge_scenes(s, keep_id=_u.UUID(keep), drop_id=_u.UUID(drop), resolved_by="merge_exact_title_duration")
+                merged += 1
+            except Exception as e:  # report the failure and continue with the remaining pairs
+                print(f" ERR keep {keep[:8]} drop {drop[:8]}: {e}")
+    print(f"DONE merged={merged}/{pairs}", flush=True)
+
+
+if __name__ == "__main__":
+    main()