Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
268 lines
11 KiB
Python
268 lines
11 KiB
Python
"""Bulk auto-merge wysokocrediblowych pending merge_candidates.
|
|
|
|
Strategia: pending kandydaci scena↔scena spełniający WSZYSTKIE:
|
|
- score >= 0.85
|
|
- cross-source: jeden bok ma TPDB ref, drugi ma StashDB ref (źródła zaufane)
|
|
- studio match: oba scenes mają studio_id i są równe (lub oba NULL — rzadko)
|
|
- data pasuje: |release_date_left - release_date_right| <= 7 dni
|
|
(lub jedna strona ma NULL date i druga też — wtedy pomijamy bo niepewne)
|
|
|
|
Side keep: preferuje TPDB (więcej metadanych: studio aliases, performers, dłuższe descs).
|
|
Jeśli oba mają TPDB lub oba mają StashDB → keep_left (deterministyczne).
|
|
|
|
Reżim:
|
|
python bulk_auto_merge.py [--dry-run] [--score-min 0.85] [--max N]
|
|
|
|
Run z VPS:
|
|
docker cp scripts/bulk_auto_merge.py goon-api-1:/tmp/
|
|
docker exec goon-api-1 python /tmp/bulk_auto_merge.py [args]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from datetime import timedelta
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.db import session_scope
|
|
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
|
|
from app.models.scene import Scene, ScenePerformer, SceneExternalRef
|
|
from app.models.source import Source, SourceKind
|
|
from app.resolve.scene_merge import MergeError, resolve_candidate
|
|
|
|
# Module-level logger for this script, registered under the name "bulk_auto_merge".
log = logging.getLogger("bulk_auto_merge")
|
|
|
|
|
|
# Score at or above which a canon<->pornapp candidate skips the heuristic sanity
# checks. Per the original author's notes, such scores come from a
# fingerprint/external_id match, which outranks the heuristics.
_SANITY_BYPASS_SCORE = 0.95


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the bulk auto-merge script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--score-min", type=float, default=0.85)
    parser.add_argument("--date-window-days", type=int, default=7)
    parser.add_argument("--max", type=int, default=10000)
    parser.add_argument(
        "--allow-same-source-above",
        type=float,
        default=None,
        help="Score >= X auto-merge regardless of cross-source (np. 0.95 dla perfect fingerprint)",
    )
    parser.add_argument(
        "--include-pornapp",
        action="store_true",
        help=(
            "Akceptuj canon (TPDB|StashDB) ↔ pornapp jako cross-source. Pomija twardy "
            "studio match (porn-app tube != canonical studio). Wymaga ≥1 wspólnego "
            "performera (jeśli oba mają) i title_fuzzy >= --min-title-fuzzy."
        ),
    )
    parser.add_argument(
        "--min-title-fuzzy",
        type=float,
        default=0.7,
        help=(
            "Min token_set_ratio dla canon↔pornapp (default 0.7). Bypass dla score≥0.95 "
            "(fingerprint/external_id match — tytuły mogą się różnić bo tube SEO != studio)."
        ),
    )
    parser.add_argument(
        "--max-duration-diff-sec",
        type=int,
        default=30,
        help=(
            "Max różnica duration_sec między canon i pornapp dla auto-merge (default 30s). "
            "Bez tego scen z tym samym performerem dostają 0.91+ score nawet gdy są różne "
            "(np. 35min VR vs 8min anal). Bypass: score≥0.95."
        ),
    )
    return parser


def _prefetch_scene_context(session: "Session", scene_ids: set) -> tuple:
    """Load the per-scene lookup tables needed to judge merge candidates.

    Args:
        session: open database session used for the read queries.
        scene_ids: ids of every scene referenced by a candidate.

    Returns:
        A ``(kinds, meta, performers)`` tuple of dicts keyed by scene id:
        ``kinds`` maps to the set of ``Source.kind`` values of the scene's
        external refs, ``meta`` maps to ``(studio_id, release_date,
        duration_sec, title_normalized or "")`` and ``performers`` maps to the
        set of performer ids. Scenes without rows are simply absent.
    """
    kinds: dict = {}
    meta: dict = {}
    performers: dict = {}
    if not scene_ids:
        # No candidates: skip the queries instead of sending empty IN lists.
        return kinds, meta, performers

    # Source kinds per scene (TPDB vs StashDB vs scraper).
    for sid, kind in session.execute(
        select(SceneExternalRef.scene_id, Source.kind)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id.in_(scene_ids))
    ):
        kinds.setdefault(sid, set()).add(kind)

    # Scene metadata plus normalized title, fetched in a single pass.
    for sid, studio_id, rel_date, dur, tnorm in session.execute(
        select(
            Scene.id,
            Scene.studio_id,
            Scene.release_date,
            Scene.duration_sec,
            Scene.title_normalized,
        ).where(Scene.id.in_(scene_ids))
    ):
        meta[sid] = (studio_id, rel_date, dur, tnorm or "")

    # Performer sets per scene (for the canon<->pornapp overlap check).
    for sid, pid in session.execute(
        select(ScenePerformer.scene_id, ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id.in_(scene_ids))
    ):
        performers.setdefault(sid, set()).add(pid)

    return kinds, meta, performers


def main() -> None:
    """Auto-merge high-confidence pending scene merge candidates.

    Parses the CLI flags, loads pending scene candidates with
    ``score >= --score-min`` (highest score first, capped by ``--max``) and
    decides for each one whether it may be merged:

    * The pair must be canon<->canon (one side has a TPDB ref, the other a
      StashDB ref), canon<->pornapp (scraper-only side; requires
      ``--include-pornapp``), or have a score at or above
      ``--allow-same-source-above``.
    * canon<->pornapp pairs scoring below 0.95 must not have disjoint
      performer sets (when both sides list performers), must have known
      durations within ``--max-duration-diff-sec`` and, when both normalized
      titles are non-empty, a token_set_ratio of at least
      ``--min-title-fuzzy``. The canonical side is kept.
    * Every other pair needs equal, non-NULL ``studio_id`` values and
      non-NULL release dates at most ``--date-window-days`` apart. The right
      side is kept only when it alone has a TPDB ref.

    With ``--dry-run`` nothing is written; would-be merges are only counted.
    A counter per outcome is logged at the end. Failures of individual merges
    are logged and counted, never raised.
    """
    args = _build_parser().parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Imported before touching the database so a missing package fails early.
    from rapidfuzz import fuzz

    # Read phase. Candidates are copied into plain tuples so that nothing below
    # touches ORM instances once this session is closed, and so that no read
    # session stays open while the per-candidate merge sessions run.
    with session_scope() as read_session:
        pending = (
            read_session.execute(
                select(MergeCandidate)
                .where(MergeCandidate.status == MergeStatus.pending)
                .where(MergeCandidate.kind == MergeKind.scene)
                .where(MergeCandidate.score >= args.score_min)
                .order_by(MergeCandidate.score.desc())
                .limit(args.max)
            )
            .scalars()
            .all()
        )
        log.info("pending candidates >= %.2f: %d", args.score_min, len(pending))

        candidates = [(c.id, c.left_id, c.right_id, c.score) for c in pending]
        scene_ids = {sid for _, left, right, _ in candidates for sid in (left, right)}
        kinds_per_scene, scene_meta, perf_per_scene = _prefetch_scene_context(
            read_session, scene_ids
        )

    counters = {
        "merged": 0,
        "merged_canon_pornapp": 0,
        "skip_score": 0,
        "skip_not_cross_source": 0,
        "skip_studio_mismatch": 0,
        "skip_date_mismatch": 0,
        "skip_no_performer_overlap": 0,
        "skip_low_title_fuzzy": 0,
        "skip_duration_mismatch": 0,
        "skip_duration_unknown": 0,
        "skip_missing_meta": 0,
        "errored": 0,
    }

    for cand_id, left_id, right_id, score in candidates:
        l_kinds = kinds_per_scene.get(left_id, set())
        r_kinds = kinds_per_scene.get(right_id, set())
        l_has_tpdb = SourceKind.tpdb in l_kinds
        l_has_stash = SourceKind.stashdb in l_kinds
        l_has_pa = SourceKind.scraper in l_kinds
        r_has_tpdb = SourceKind.tpdb in r_kinds
        r_has_stash = SourceKind.stashdb in r_kinds
        r_has_pa = SourceKind.scraper in r_kinds
        l_canon = l_has_tpdb or l_has_stash
        r_canon = r_has_tpdb or r_has_stash

        # Three ways a candidate qualifies:
        #   1. canon<->canon (TPDB<->StashDB) — the original mode
        #   2. canon<->pornapp — only with --include-pornapp; the pornapp side
        #      must have no canonical ref of its own
        #   3. --allow-same-source-above bypass for very high scores
        canon_canon = (l_has_tpdb and r_has_stash) or (l_has_stash and r_has_tpdb)
        canon_pa = args.include_pornapp and (
            (l_canon and r_has_pa and not r_canon)
            or (r_canon and l_has_pa and not l_canon)
        )
        bypass = (
            args.allow_same_source_above is not None
            and score >= args.allow_same_source_above
        )
        if not (canon_canon or canon_pa or bypass):
            counters["skip_not_cross_source"] += 1
            continue

        l_meta = scene_meta.get(left_id)
        r_meta = scene_meta.get(right_id)
        if l_meta is None or r_meta is None:
            counters["skip_missing_meta"] += 1
            continue
        l_studio, l_date, l_dur, l_title = l_meta
        r_studio, r_date, r_dur, r_title = r_meta

        if canon_pa:
            # canon<->pornapp: studio and date are not informative (per the
            # original notes, pornapp reports the tube name as studio and
            # usually has no release_date). Checked instead:
            #   1. at least one shared performer (when both sides have any)
            #   2. duration within +/- N seconds — the key check, because
            #      performer + title tokens alone can push different scenes of
            #      the same performer above 0.91
            #   3. title fuzzy ratio >= --min-title-fuzzy
            # All three are skipped at score >= 0.95.
            if score < _SANITY_BYPASS_SCORE:
                l_perfs = perf_per_scene.get(left_id, set())
                r_perfs = perf_per_scene.get(right_id, set())
                if l_perfs and r_perfs and not (l_perfs & r_perfs):
                    counters["skip_no_performer_overlap"] += 1
                    continue
                # Duration must be known on both sides; otherwise reject,
                # because the pair cannot be judged safely.
                if l_dur is None or r_dur is None:
                    counters["skip_duration_unknown"] += 1
                    continue
                if abs(l_dur - r_dur) > args.max_duration_diff_sec:
                    counters["skip_duration_mismatch"] += 1
                    continue
                if l_title and r_title:
                    title_fuzz = fuzz.token_set_ratio(l_title, r_title) / 100.0
                    if title_fuzz < args.min_title_fuzzy:
                        counters["skip_low_title_fuzzy"] += 1
                        continue
            # Keep whichever side is canonical.
            keep_left = l_canon
        else:
            # canon<->canon (TPDB<->StashDB) or bypass: standard sanity checks.
            if l_studio is None or r_studio is None or l_studio != r_studio:
                counters["skip_studio_mismatch"] += 1
                continue
            if l_date is None or r_date is None:
                counters["skip_date_mismatch"] += 1
                continue
            if abs((l_date - r_date).days) > args.date_window_days:
                counters["skip_date_mismatch"] += 1
                continue
            # TPDB is preferred; the right side wins only when it alone has a
            # TPDB ref, every other combination keeps the left side.
            keep_left = l_has_tpdb or not r_has_tpdb

        if not args.dry_run:
            try:
                # One session per merge, separate from the read session above.
                # NOTE(review): presumably session_scope commits on success and
                # rolls back on error, isolating failures — confirm in app.db.
                with session_scope() as merge_session:
                    resolve_candidate(
                        merge_session,
                        candidate_id=cand_id,
                        action="merge",
                        keep_left=keep_left,
                        resolved_by="bulk_auto_merge",
                    )
            except MergeError as e:
                # Expected domain failure: the message is enough.
                counters["errored"] += 1
                log.warning("merge %s failed: %s", cand_id, e)
                continue
            except Exception as e:
                # Best-effort batch: keep going, but keep the traceback so an
                # unexpected failure can be diagnosed.
                counters["errored"] += 1
                log.warning("merge %s failed: %s", cand_id, e, exc_info=True)
                continue

        counters["merged"] += 1
        if canon_pa:
            counters["merged_canon_pornapp"] += 1
        if not args.dry_run and counters["merged"] % 100 == 0:
            log.info("progress merged=%d", counters["merged"])

    log.info("done: %s", counters)
|
|
|
|
|
|
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    raise SystemExit(main())
|