"""Bulk auto-merge wysokocrediblowych pending merge_candidates. Strategia: pending kandydaci scena↔scena spełniający WSZYSTKIE: - score >= 0.85 - cross-source: jeden bok ma TPDB ref, drugi ma StashDB ref (źródła zaufane) - studio match: oba scenes mają studio_id i są równe (lub oba NULL — rzadko) - data pasuje: |release_date_left - release_date_right| <= 7 dni (lub jedna strona ma NULL date i druga też — wtedy pomijamy bo niepewne) Side keep: preferuje TPDB (więcej metadanych: studio aliases, performers, dłuższe descs). Jeśli oba mają TPDB lub oba mają StashDB → keep_left (deterministyczne). Reżim: python bulk_auto_merge.py [--dry-run] [--score-min 0.85] [--max N] Run z VPS: docker cp scripts/bulk_auto_merge.py goon-api-1:/tmp/ docker exec goon-api-1 python /tmp/bulk_auto_merge.py [args] """ from __future__ import annotations import argparse import logging import sys from datetime import timedelta from sqlalchemy import select from sqlalchemy.orm import Session from app.db import session_scope from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus from app.models.scene import Scene, ScenePerformer, SceneExternalRef from app.models.source import Source, SourceKind from app.resolve.scene_merge import MergeError, resolve_candidate log = logging.getLogger("bulk_auto_merge") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--dry-run", action="store_true") parser.add_argument("--score-min", type=float, default=0.85) parser.add_argument("--date-window-days", type=int, default=7) parser.add_argument("--max", type=int, default=10000) parser.add_argument( "--allow-same-source-above", type=float, default=None, help="Score >= X auto-merge regardless of cross-source (np. 0.95 dla perfect fingerprint)", ) parser.add_argument( "--include-pornapp", action="store_true", help=( "Akceptuj canon (TPDB|StashDB) ↔ pornapp jako cross-source. Pomija twardy " "studio match (porn-app tube != canonical studio). Wymaga ≥1 wspólnego " "performera (jeśli oba mają) i title_fuzzy >= --min-title-fuzzy." ), ) parser.add_argument( "--min-title-fuzzy", type=float, default=0.7, help=( "Min token_set_ratio dla canon↔pornapp (default 0.7). Bypass dla score≥0.95 " "(fingerprint/external_id match — tytuły mogą się różnić bo tube SEO != studio)." ), ) parser.add_argument( "--max-duration-diff-sec", type=int, default=30, help=( "Max różnica duration_sec między canon i pornapp dla auto-merge (default 30s). " "Bez tego scen z tym samym performerem dostają 0.91+ score nawet gdy są różne " "(np. 35min VR vs 8min anal). Bypass: score≥0.95." ), ) args = parser.parse_args() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") with session_scope() as s: # Pull all pending scene candidates above threshold cands = ( s.execute( select(MergeCandidate) .where(MergeCandidate.status == MergeStatus.pending) .where(MergeCandidate.kind == MergeKind.scene) .where(MergeCandidate.score >= args.score_min) .order_by(MergeCandidate.score.desc()) .limit(args.max) ) .scalars() .all() ) log.info("pending candidates >= %.2f: %d", args.score_min, len(cands)) # Pre-fetch source kinds for left/right scenes (TPDB vs StashDB vs scraper) scene_ids = {c.left_id for c in cands} | {c.right_id for c in cands} kinds_per_scene: dict = {} for sid, kind in s.execute( select(SceneExternalRef.scene_id, Source.kind) .join(Source, Source.id == SceneExternalRef.source_id) .where(SceneExternalRef.scene_id.in_(scene_ids)) ): kinds_per_scene.setdefault(sid, set()).add(kind) # Pre-fetch scene meta (studio_id, release_date, duration_sec) scene_meta: dict = {} for sid, studio_id, rel_date, dur in s.execute( select(Scene.id, Scene.studio_id, Scene.release_date, Scene.duration_sec) .where(Scene.id.in_(scene_ids)) ): scene_meta[sid] = (studio_id, rel_date, dur) # Pre-fetch performer sets per scene (for overlap check w canon↔pornapp). perf_per_scene: dict = {} for sid, pid in s.execute( select(ScenePerformer.scene_id, ScenePerformer.performer_id) .where(ScenePerformer.scene_id.in_(scene_ids)) ): perf_per_scene.setdefault(sid, set()).add(pid) # Pre-fetch title_normalized for fuzzy check. title_per_scene: dict = {} for sid, tnorm in s.execute( select(Scene.id, Scene.title_normalized).where(Scene.id.in_(scene_ids)) ): title_per_scene[sid] = tnorm or "" counters = { "merged": 0, "merged_canon_pornapp": 0, "skip_score": 0, "skip_not_cross_source": 0, "skip_studio_mismatch": 0, "skip_date_mismatch": 0, "skip_no_performer_overlap": 0, "skip_low_title_fuzzy": 0, "skip_duration_mismatch": 0, "skip_duration_unknown": 0, "skip_missing_meta": 0, "errored": 0, } window = timedelta(days=args.date_window_days) from rapidfuzz import fuzz for c in cands: l_kinds = kinds_per_scene.get(c.left_id, set()) r_kinds = kinds_per_scene.get(c.right_id, set()) l_has_tpdb = SourceKind.tpdb in l_kinds l_has_stash = SourceKind.stashdb in l_kinds l_has_pa = SourceKind.scraper in l_kinds r_has_tpdb = SourceKind.tpdb in r_kinds r_has_stash = SourceKind.stashdb in r_kinds r_has_pa = SourceKind.scraper in r_kinds l_canon = l_has_tpdb or l_has_stash r_canon = r_has_tpdb or r_has_stash # 3 typy cross-source: # 1. canon↔canon (TPDB↔StashDB) — original mode # 2. canon↔pornapp — z --include-pornapp; pornapp side dziedziczy tagi z canon # 3. allow-same-source-above bypass dla bardzo wysokich score (fingerprint match) canon_canon = (l_has_tpdb and r_has_stash) or (l_has_stash and r_has_tpdb) canon_pa = ( args.include_pornapp and ((l_canon and r_has_pa and not r_canon) or (r_canon and l_has_pa and not l_canon)) ) bypass = ( args.allow_same_source_above is not None and c.score >= args.allow_same_source_above ) if not canon_canon and not canon_pa and not bypass: counters["skip_not_cross_source"] += 1 continue l_meta = scene_meta.get(c.left_id) r_meta = scene_meta.get(c.right_id) if not l_meta or not r_meta: counters["skip_missing_meta"] += 1 continue l_studio, l_date, l_dur = l_meta r_studio, r_date, r_dur = r_meta if canon_pa: # canon↔pornapp: studio + date NIE są informatywne (pornapp daje tube name # jako studio, typowo brak release_date). Zamiast tego: # # 1. ≥1 wspólny performer (jeśli oba mają) # 2. duration ±N sekund — KLUCZOWE bo scoring.py boostuje score do 0.91+ # przez performer+title-tokens nawet gdy to różne sceny tego samego # performera (np. 7 różnych Lena Paul scen na 1 canonical → 7 false pos). # Duration to twardy fizyczny sygnał: dwie te same sceny mają tę samą # długość. 35min VR scena vs 8min anal scena → różne sceny. # 3. title fuzzy ≥ --min-title-fuzzy (bypass score≥0.95) # # Bypass wszystkich sanity gdy score ≥ 0.95 (to wtedy fingerprint/external_id # match, sygnał wyższego rzędu niż heurystyki). if c.score < 0.95: l_perfs = perf_per_scene.get(c.left_id, set()) r_perfs = perf_per_scene.get(c.right_id, set()) if l_perfs and r_perfs and not (l_perfs & r_perfs): counters["skip_no_performer_overlap"] += 1 continue # Duration check — wymaga znanej duration po obu stronach. Brak duration # → reject (nie umiemy bezpiecznie zdecydować). if l_dur is None or r_dur is None: counters["skip_duration_unknown"] += 1 continue if abs(l_dur - r_dur) > args.max_duration_diff_sec: counters["skip_duration_mismatch"] += 1 continue ltitle = title_per_scene.get(c.left_id, "") rtitle = title_per_scene.get(c.right_id, "") if ltitle and rtitle: title_fuzz = fuzz.token_set_ratio(ltitle, rtitle) / 100.0 if title_fuzz < args.min_title_fuzzy: counters["skip_low_title_fuzzy"] += 1 continue keep_left = l_canon else: # canon↔canon (TPDB↔StashDB) lub bypass: standardowe sanity checks. if l_studio is None or r_studio is None or l_studio != r_studio: counters["skip_studio_mismatch"] += 1 continue if l_date is None or r_date is None: counters["skip_date_mismatch"] += 1 continue if abs((l_date - r_date).days) > args.date_window_days: counters["skip_date_mismatch"] += 1 continue # TPDB preferowany (więcej metadanych); inaczej keep_left. if l_has_tpdb and not r_has_tpdb: keep_left = True elif r_has_tpdb and not l_has_tpdb: keep_left = False else: keep_left = True if args.dry_run: counters["merged"] += 1 if canon_pa: counters["merged_canon_pornapp"] += 1 continue try: with session_scope() as s: resolve_candidate( s, candidate_id=c.id, action="merge", keep_left=keep_left, resolved_by="bulk_auto_merge", ) counters["merged"] += 1 if canon_pa: counters["merged_canon_pornapp"] += 1 if counters["merged"] % 100 == 0: log.info("progress merged=%d", counters["merged"]) except (MergeError, Exception) as e: counters["errored"] += 1 log.warning("merge %s failed: %s", c.id, e) log.info("done: %s", counters) if __name__ == "__main__": main()