goon/scripts/bulk_auto_merge.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

268 lines
11 KiB
Python

"""Bulk auto-merge wysokocrediblowych pending merge_candidates.

Strategy: pending scene↔scene candidates that satisfy ALL of:
- score >= 0.85 (configurable with --score-min)
- cross-source: one side has a TPDB ref, the other has a StashDB ref (trusted sources)
- studio match: both scenes have a studio_id and the two are equal
  (a pair with a NULL studio_id on either side is skipped)
- date match: |release_date_left - release_date_right| <= 7 days
  (a pair with a NULL release_date on either side is skipped as uncertain)

Side to keep: TPDB is preferred (more metadata: studio aliases, performers, longer descriptions).
If both sides have TPDB or both have StashDB → keep_left (deterministic).

Additional opt-in modes (--include-pornapp, --allow-same-source-above) are described in --help.

Usage:
    python bulk_auto_merge.py [--dry-run] [--score-min 0.85] [--max N]

Run from the VPS:
    docker cp scripts/bulk_auto_merge.py goon-api-1:/tmp/
    docker exec goon-api-1 python /tmp/bulk_auto_merge.py [args]
"""
from __future__ import annotations
import argparse
import logging
import sys
from datetime import timedelta
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.db import session_scope
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
from app.models.scene import Scene, ScenePerformer, SceneExternalRef
from app.models.source import Source, SourceKind
from app.resolve.scene_merge import MergeError, resolve_candidate
# Script-wide logger; handlers and format are configured in main() via basicConfig.
log = logging.getLogger("bulk_auto_merge")
# Score at or above which a canon↔pornapp candidate skips the performer,
# duration and title heuristics (treated as a fingerprint/external_id match,
# a stronger signal than those heuristics).
_SANITY_BYPASS_SCORE = 0.95


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the bulk auto-merge run.

    The help texts are runtime strings and are kept verbatim.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--score-min", type=float, default=0.85)
    parser.add_argument("--date-window-days", type=int, default=7)
    parser.add_argument("--max", type=int, default=10000)
    parser.add_argument(
        "--allow-same-source-above",
        type=float,
        default=None,
        help="Score >= X auto-merge regardless of cross-source (np. 0.95 dla perfect fingerprint)",
    )
    parser.add_argument(
        "--include-pornapp",
        action="store_true",
        help=(
            "Akceptuj canon (TPDB|StashDB) ↔ pornapp jako cross-source. Pomija twardy "
            "studio match (porn-app tube != canonical studio). Wymaga ≥1 wspólnego "
            "performera (jeśli oba mają) i title_fuzzy >= --min-title-fuzzy."
        ),
    )
    parser.add_argument(
        "--min-title-fuzzy",
        type=float,
        default=0.7,
        help=(
            "Min token_set_ratio dla canon↔pornapp (default 0.7). Bypass dla score≥0.95 "
            "(fingerprint/external_id match — tytuły mogą się różnić bo tube SEO != studio)."
        ),
    )
    parser.add_argument(
        "--max-duration-diff-sec",
        type=int,
        default=30,
        help=(
            "Max różnica duration_sec między canon i pornapp dla auto-merge (default 30s). "
            "Bez tego scen z tym samym performerem dostają 0.91+ score nawet gdy są różne "
            "(np. 35min VR vs 8min anal). Bypass: score≥0.95."
        ),
    )
    return parser


def main() -> None:
    """Auto-merge pending scene merge candidates that pass the sanity checks.

    Steps:
      1. Load pending scene candidates with score >= --score-min (highest
         score first, at most --max rows) and pre-fetch, for every scene
         they reference, its source kinds, studio/date/duration, title and
         performer ids.
      2. For each candidate decide whether it is eligible (canon↔canon,
         canon↔pornapp with --include-pornapp, or the
         --allow-same-source-above bypass) and which side to keep.
      3. Unless --dry-run is given, call ``resolve_candidate`` for every
         eligible candidate, each in its own session scope, so one failure
         does not affect the others.

    A dict of counters (merged / skipped-by-reason / errored) is logged at
    the end. With --dry-run, eligible candidates are counted as "merged"
    but nothing is written.
    """
    args = _build_parser().parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    with session_scope() as s:
        # Pull all pending scene candidates above the threshold.
        cands = (
            s.execute(
                select(MergeCandidate)
                .where(MergeCandidate.status == MergeStatus.pending)
                .where(MergeCandidate.kind == MergeKind.scene)
                .where(MergeCandidate.score >= args.score_min)
                .order_by(MergeCandidate.score.desc())
                .limit(args.max)
            )
            .scalars()
            .all()
        )
        log.info("pending candidates >= %.2f: %d", args.score_min, len(cands))

        # Copy the scalar fields needed later into plain tuples so the merge
        # loop never touches ORM instances after this session scope has
        # exited (attribute access on them could otherwise depend on the
        # session's expiry configuration).
        pending = [(c.id, c.left_id, c.right_id, c.score) for c in cands]
        scene_ids = {left for _, left, _, _ in pending} | {right for _, _, right, _ in pending}

        # Source kinds per scene (TPDB vs StashDB vs scraper).
        kinds_per_scene: dict = {}
        for sid, kind in s.execute(
            select(SceneExternalRef.scene_id, Source.kind)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id.in_(scene_ids))
        ):
            kinds_per_scene.setdefault(sid, set()).add(kind)

        # Scene meta (studio_id, release_date, duration_sec) and the
        # normalized title for the fuzzy check, fetched in a single query.
        scene_meta: dict = {}
        title_per_scene: dict = {}
        for sid, studio_id, rel_date, dur, tnorm in s.execute(
            select(
                Scene.id,
                Scene.studio_id,
                Scene.release_date,
                Scene.duration_sec,
                Scene.title_normalized,
            ).where(Scene.id.in_(scene_ids))
        ):
            scene_meta[sid] = (studio_id, rel_date, dur)
            title_per_scene[sid] = tnorm or ""

        # Performer id sets per scene (overlap check for canon↔pornapp).
        perf_per_scene: dict = {}
        for sid, pid in s.execute(
            select(ScenePerformer.scene_id, ScenePerformer.performer_id)
            .where(ScenePerformer.scene_id.in_(scene_ids))
        ):
            perf_per_scene.setdefault(sid, set()).add(pid)

    counters = {
        "merged": 0,
        "merged_canon_pornapp": 0,
        # Never incremented: the score threshold is already applied in SQL.
        # The key is kept so the shape of the summary log line is unchanged.
        "skip_score": 0,
        "skip_not_cross_source": 0,
        "skip_studio_mismatch": 0,
        "skip_date_mismatch": 0,
        "skip_no_performer_overlap": 0,
        "skip_low_title_fuzzy": 0,
        "skip_duration_mismatch": 0,
        "skip_duration_unknown": 0,
        "skip_missing_meta": 0,
        "errored": 0,
    }
    window = timedelta(days=args.date_window_days)

    from rapidfuzz import fuzz

    for cand_id, left_id, right_id, score in pending:
        l_kinds = kinds_per_scene.get(left_id, set())
        r_kinds = kinds_per_scene.get(right_id, set())
        l_has_tpdb = SourceKind.tpdb in l_kinds
        l_has_stash = SourceKind.stashdb in l_kinds
        l_has_pa = SourceKind.scraper in l_kinds
        r_has_tpdb = SourceKind.tpdb in r_kinds
        r_has_stash = SourceKind.stashdb in r_kinds
        r_has_pa = SourceKind.scraper in r_kinds
        l_canon = l_has_tpdb or l_has_stash
        r_canon = r_has_tpdb or r_has_stash

        # Three kinds of acceptable pairs:
        #   1. canon↔canon (TPDB↔StashDB) — the original mode
        #   2. canon↔pornapp — only with --include-pornapp
        #   3. --allow-same-source-above bypass for very high scores
        canon_canon = (l_has_tpdb and r_has_stash) or (l_has_stash and r_has_tpdb)
        canon_pa = args.include_pornapp and (
            (l_canon and r_has_pa and not r_canon)
            or (r_canon and l_has_pa and not l_canon)
        )
        bypass = (
            args.allow_same_source_above is not None
            and score >= args.allow_same_source_above
        )
        if not (canon_canon or canon_pa or bypass):
            counters["skip_not_cross_source"] += 1
            continue

        l_meta = scene_meta.get(left_id)
        r_meta = scene_meta.get(right_id)
        if l_meta is None or r_meta is None:
            counters["skip_missing_meta"] += 1
            continue
        l_studio, l_date, l_dur = l_meta
        r_studio, r_date, r_dur = r_meta

        if canon_pa:
            # canon↔pornapp: studio and date are NOT informative (pornapp
            # reports the tube name as studio and typically has no
            # release_date). Instead require:
            #   1. >=1 shared performer (only when both sides have performers)
            #   2. duration within ±N seconds — scoring can push different
            #      scenes of the same performer to 0.91+, and duration is the
            #      hard signal that separates them
            #   3. title fuzzy ratio >= --min-title-fuzzy (only when both
            #      titles are non-empty)
            # All three are skipped at score >= _SANITY_BYPASS_SCORE.
            if score < _SANITY_BYPASS_SCORE:
                l_perfs = perf_per_scene.get(left_id, set())
                r_perfs = perf_per_scene.get(right_id, set())
                if l_perfs and r_perfs and l_perfs.isdisjoint(r_perfs):
                    counters["skip_no_performer_overlap"] += 1
                    continue
                # The duration check needs a known duration on both sides;
                # without it the pair is rejected as undecidable.
                if l_dur is None or r_dur is None:
                    counters["skip_duration_unknown"] += 1
                    continue
                if abs(l_dur - r_dur) > args.max_duration_diff_sec:
                    counters["skip_duration_mismatch"] += 1
                    continue
                ltitle = title_per_scene.get(left_id, "")
                rtitle = title_per_scene.get(right_id, "")
                if ltitle and rtitle:
                    title_fuzz = fuzz.token_set_ratio(ltitle, rtitle) / 100.0
                    if title_fuzz < args.min_title_fuzzy:
                        counters["skip_low_title_fuzzy"] += 1
                        continue
            # Keep whichever side is the canonical one.
            keep_left = l_canon
        else:
            # canon↔canon (TPDB↔StashDB) or bypass: standard sanity checks.
            if l_studio is None or r_studio is None or l_studio != r_studio:
                counters["skip_studio_mismatch"] += 1
                continue
            if l_date is None or r_date is None:
                counters["skip_date_mismatch"] += 1
                continue
            # Compare the absolute difference against the window itself so
            # the result does not depend on which side is earlier (taking
            # `.days` of a negative timedelta rounds toward -infinity).
            if abs(l_date - r_date) > window:
                counters["skip_date_mismatch"] += 1
                continue
            # TPDB is preferred (more metadata); keep the right side only
            # when it alone has a TPDB ref, otherwise keep left.
            keep_left = not (r_has_tpdb and not l_has_tpdb)

        if args.dry_run:
            counters["merged"] += 1
            if canon_pa:
                counters["merged_canon_pornapp"] += 1
            continue

        try:
            # A separate session scope per merge isolates failures.
            with session_scope() as merge_session:
                resolve_candidate(
                    merge_session,
                    candidate_id=cand_id,
                    action="merge",
                    keep_left=keep_left,
                    resolved_by="bulk_auto_merge",
                )
        except MergeError as e:
            # Expected domain failure: a one-line warning is enough.
            counters["errored"] += 1
            log.warning("merge %s failed: %s", cand_id, e)
        except Exception as e:
            # Unexpected failure: keep going (best effort) but record the
            # traceback so the cause can be diagnosed.
            counters["errored"] += 1
            log.exception("merge %s failed: %s", cand_id, e)
        else:
            counters["merged"] += 1
            if canon_pa:
                counters["merged_canon_pornapp"] += 1
            if counters["merged"] % 100 == 0:
                log.info("progress merged=%d", counters["merged"])

    log.info("done: %s", counters)
# Run the bulk merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()