fix(merge): move playback_sources on scene merge + exact-title+duration dedup
merge_scenes never reassigned playback_sources, so ON DELETE CASCADE dropped them together with the absorbed scene. Cross-source (canonical) merges rarely had tube playback, which hid the problem, but tube-dup merges silently LOST playback links. Add _move_playback_sources (the global unique constraint on (origin, page_url) guarantees no collision on reassign). Also add merge_exact_title_duration.py, which catches missing-merge dupes that bulk_dedup misses (same performer + identical normalized title + identical duration_sec, no phash). Bad Bella had 25 such pairs (bug-report ef92809d "duplikat, te same miniatury"). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
8f34a3e2f1
commit
e23e2d1f17
2 changed files with 116 additions and 0 deletions
|
|
@ -21,6 +21,7 @@ from sqlalchemy import or_, select, update
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
|
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
from app.models.scene import (
|
from app.models.scene import (
|
||||||
Scene,
|
Scene,
|
||||||
SceneExternalRef,
|
SceneExternalRef,
|
||||||
|
|
@ -55,6 +56,7 @@ def merge_scenes(
|
||||||
_move_performers(session, keep_id=keep_id, drop_id=drop_id)
|
_move_performers(session, keep_id=keep_id, drop_id=drop_id)
|
||||||
_move_tags(session, keep_id=keep_id, drop_id=drop_id)
|
_move_tags(session, keep_id=keep_id, drop_id=drop_id)
|
||||||
_move_fingerprints(session, keep_id=keep_id, drop_id=drop_id)
|
_move_fingerprints(session, keep_id=keep_id, drop_id=drop_id)
|
||||||
|
_move_playback_sources(session, keep_id=keep_id, drop_id=drop_id)
|
||||||
_coalesce_canonical_fields(keep, drop)
|
_coalesce_canonical_fields(keep, drop)
|
||||||
|
|
||||||
session.delete(drop)
|
session.delete(drop)
|
||||||
|
|
@ -186,6 +188,22 @@ def _move_fingerprints(session: Session, *, keep_id: uuid.UUID, drop_id: uuid.UU
|
||||||
fp.scene_id = keep_id
|
fp.scene_id = keep_id
|
||||||
|
|
||||||
|
|
||||||
|
def _move_playback_sources(session: Session, *, keep_id: uuid.UUID, drop_id: uuid.UUID) -> None:
    """Re-point the playback sources of the `drop` scene at the `keep` scene.

    Issues one bulk UPDATE setting ``scene_id = keep_id`` on every
    ``PlaybackSource`` row whose ``scene_id`` equals ``drop_id``.

    Background (from the original author's note): merge_scenes used to leave
    playback_sources alone, so ON DELETE CASCADE removed them together with
    the dropped scene. Cross-source merges (tpdb <-> stashdb canonical,
    usually without tube playback) were not hurt, but phash_exact / tube-dup
    merges LOST their playback links.

    NOTE(review): the author states the unique constraint
    ``(origin, page_url)`` is GLOBAL, so `drop` and `keep` cannot share a
    source and changing only ``scene_id`` cannot collide. The constraint is
    not visible from here - confirm against the model / migration.

    Args:
        session: Session the UPDATE is executed on.
        keep_id: Id of the surviving scene that receives the rows.
        drop_id: Id of the scene about to be deleted.
    """
    reassign_stmt = (
        update(PlaybackSource)
        .where(PlaybackSource.scene_id == drop_id)
        .values(scene_id=keep_id)
    )
    session.execute(reassign_stmt)
|
||||||
|
|
||||||
|
|
||||||
def _coalesce_canonical_fields(keep: Scene, drop: Scene) -> None:
|
def _coalesce_canonical_fields(keep: Scene, drop: Scene) -> None:
|
||||||
"""Wypełnij braki w `keep` polami z `drop`. Nie nadpisuje istniejących wartości."""
|
"""Wypełnij braki w `keep` polami z `drop`. Nie nadpisuje istniejących wartości."""
|
||||||
if not keep.description and drop.description:
|
if not keep.description and drop.description:
|
||||||
|
|
|
||||||
98
scripts/merge_exact_title_duration.py
Normal file
98
scripts/merge_exact_title_duration.py
Normal file
|
|
@ -0,0 +1,98 @@
|
||||||
|
"""Merge missing-merge duplikatów: ten sam performer + identyczny znormalizowany tytuł
|
||||||
|
+ identyczna długość (co do sekundy).
|
||||||
|
|
||||||
|
Kontekst: bulk_dedup łapie cross-source (tpdb↔stashdb) i exact-phash, ale NIE łapie
|
||||||
|
tube-dup bez fingerprintów (np. ta sama scena zescrapowana 2× pod różnym URL/slug).
|
||||||
|
Na stronie performera user widzi wtedy "te same miniatury, duplikat" (bug-report
|
||||||
|
ef92809d — Bad Bella miała 25 takich par). Sygnał `same performer + exact norm-title
|
||||||
|
+ exact duration_sec` jest praktycznie pewny (dwa różne wideo nie mają byte-identycznego
|
||||||
|
tytułu I długości co do sekundy).
|
||||||
|
|
||||||
|
Keep = scena z największą liczbą external_refs → potem playback_sources → potem najstarsza.
|
||||||
|
Merge przez resolve.scene_merge.merge_scenes (przenosi refs/performers/tags/fingerprints/
|
||||||
|
playback_sources — playback move dodany 2026-06-08 razem z tym skryptem).
|
||||||
|
|
||||||
|
Użycie (kontener worker):
|
||||||
|
python scripts/merge_exact_title_duration.py [PERFORMER_ID] [--commit]
|
||||||
|
Bez PERFORMER_ID = wszyscy performerzy (global). Bez --commit = dry-run.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.db import session_scope
|
||||||
|
from app.resolve.scene_merge import merge_scenes
|
||||||
|
|
||||||
|
|
||||||
|
def _args() -> tuple[str | None, bool]:
|
||||||
|
commit = "--commit" in sys.argv
|
||||||
|
pid = None
|
||||||
|
for a in sys.argv[1:]:
|
||||||
|
if a != "--commit" and len(a) >= 32:
|
||||||
|
pid = a
|
||||||
|
return pid, commit
|
||||||
|
|
||||||
|
|
||||||
|
def _groups(pid: str | None) -> list[list[str]]:
    """Return groups of duplicate scene ids, intended keeper first.

    Scenes are grouped per performer by identical ``lower(btrim(title))`` and
    identical ``duration_sec``; scenes with a NULL duration or a blank title
    are excluded. Only groups with more than one member are returned.

    Inside a group the ids are ordered by external-ref count (descending),
    playback-source count (descending), ``created_at`` (ascending) and finally
    ``id`` (ascending). The ``id`` tie-breaker makes the keeper deterministic
    when the first three keys tie, so a dry-run reports the same keeper the
    commit run will use.

    Args:
        pid: Performer id to restrict the search to, or ``None`` for all
            performers.

    Returns:
        A list of groups; each group is a list of scene ids (as text) whose
        first element is the scene to keep. A member set that appears for
        more than one performer is returned only once.
    """
    # Only this constant fragment is interpolated into the SQL text; the
    # performer id itself is always passed as the bound parameter :pid.
    where_perf = "AND sp.performer_id = :pid" if pid else ""
    sql = f"""
        WITH cand AS (
            SELECT s.id,
                   sp.performer_id,
                   lower(btrim(s.title)) nt,
                   s.duration_sec dur,
                   s.created_at,
                   (SELECT count(*) FROM scene_external_refs r WHERE r.scene_id=s.id) refs,
                   (SELECT count(*) FROM playback_sources p WHERE p.scene_id=s.id) srcs
            FROM scenes s
            JOIN scene_performers sp ON sp.scene_id=s.id {where_perf}
            WHERE s.duration_sec IS NOT NULL AND btrim(s.title) <> ''
        )
        SELECT array_agg(id::text ORDER BY refs DESC, srcs DESC, created_at ASC, id ASC) members
        FROM cand
        GROUP BY performer_id, nt, dur
        HAVING count(*) > 1
    """
    params = {"pid": pid} if pid else {}
    with session_scope() as s:
        rows = s.execute(text(sql), params).all()

    # The same set of scenes can come back once per performer the scenes
    # share; keep only the first occurrence of each member set.
    seen: set[frozenset[str]] = set()
    out: list[list[str]] = []
    for (members,) in rows:
        key = frozenset(members)
        if key in seen:
            continue
        seen.add(key)
        out.append(list(members))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Find exact title+duration duplicate groups and merge them.

    Reads the performer filter and the commit flag from the command line,
    prints a summary line, then walks every group: the first member is the
    keeper and each remaining member is merged into it.

    In dry-run mode (no ``--commit``) the planned merges are only printed.
    In commit mode each merge runs in its own ``session_scope()`` so one
    failing pair does not stop the rest; the failure is printed and the loop
    continues (deliberate best-effort).

    A pair is counted as merged only after its ``session_scope()`` block has
    exited without raising, so a failure while leaving the scope is reported
    as an error instead of being counted as a success.
    NOTE(review): presumably ``session_scope`` commits on exit - confirm in
    app.db.
    """
    # Imported once here instead of on every inner-loop iteration; local
    # because the script's module-level imports do not include uuid.
    import uuid

    pid, commit = _args()
    groups = _groups(pid)
    pairs = sum(len(g) - 1 for g in groups)
    print(f"performer={pid or 'ALL'} groups={len(groups)} merges={pairs} commit={commit}", flush=True)

    merged = 0
    for keep, *drops in groups:
        for drop in drops:
            if not commit:
                print(f" [dry] keep {keep[:8]} <- drop {drop[:8]}")
                continue
            try:
                with session_scope() as s:
                    merge_scenes(
                        s,
                        keep_id=uuid.UUID(keep),
                        drop_id=uuid.UUID(drop),
                        resolved_by="merge_exact_title_duration",
                    )
            except Exception as e:
                # Best-effort batch job: report the failed pair and carry on.
                print(f" ERR keep {keep[:8]} drop {drop[:8]}: {e}")
            else:
                merged += 1
    print(f"DONE merged={merged}/{pairs}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Reference in a new issue