# goon/app/resolve/scoring.py

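"""Scoring functions for matching scene candidates.

All sub-scores are returned in [0, 1] (``duration_proximity`` may also return None
when a value is missing). The composite score combines them with weights,
redistributing the weight when a signal is unavailable (e.g. no fingerprint).

Weights (when everything is available; see ``_BASE_WEIGHTS``):
  fp (phash):  0.40
  title:       0.20
  performers:  0.15
  date:        0.15
  duration:    0.10

Hard reject: studio_match=False → score 0.0, unless there is a strong fingerprint
match (≥0.95), in which case we trust the fingerprint and ignore the studio mismatch
(it happens that TPDB lists "Brazzers Exxtra" while StashDB lists "Brazzers" as the
scene's studio), or aggregator mode is enabled, where the studio is not informative
and a separate weight table is used.
"""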
from __future__ import annotations

import math
import uuid
from collections.abc import Iterable
from dataclasses import dataclass, field
from datetime import date

from rapidfuzz import fuzz

from app.config import get_settings


@dataclass
class ScoreBreakdown:
    """Per-sub-score values + final composite + reasons (do zapisu w merge_candidates.reasons)."""

    fp: float | None = None
    title: float | None = None
    performers: float | None = None
    date: float | None = None
    duration: float | None = None
    studio_match: bool | None = None
    composite: float = 0.0
    reasons: dict = None  # type: ignore[assignment]

    def to_dict(self) -> dict:
        return {
            "fp": self.fp,
            "title": self.title,
            "performers": self.performers,
            "date": self.date,
            "duration": self.duration,
            "studio_match": self.studio_match,
            "composite": self.composite,
            "reasons": self.reasons or {},
        }


# ---- Sub-scorers ----------------------------------------------------------

def hamming_distance_hex(a: str, b: str) -> int:
    """Count the differing bits between two hex-encoded hashes of equal length.

    Raises:
        ValueError: when the two strings differ in length (``int(..., 16)`` also
            raises ValueError for text that is not valid hexadecimal).
    """
    same_length = len(a) == len(b)
    if not same_length:
        raise ValueError(f"hash length mismatch: {len(a)} vs {len(b)}")
    # XOR leaves a 1 exactly where the two hashes disagree.
    differing_bits = int(a, 16) ^ int(b, 16)
    return format(differing_bits, "b").count("1")


def phash_similarity(a: str, b: str, *, bits: int = 64) -> float:
    """Return ``1 - hamming / bits``, floored at 0.0.

    For a 64-bit phash, up to 5 differing bits still yields a similarity ≥ 0.92.
    Length mismatches propagate as ValueError from ``hamming_distance_hex``.
    """
    distance = hamming_distance_hex(a, b)
    similarity = 1.0 - distance / bits
    # Hashes wider than `bits` could push the value below zero; floor it.
    return similarity if similarity > 0.0 else 0.0


def title_similarity(a: str, b: str) -> float:
    """Fuzzy title similarity in [0, 1]; 0.0 when either title is empty.

    ``a`` and ``b`` are expected to be already normalized (``title_normalized``).
    Token-set ratio is used because it tolerates reordered words and extra tokens.
    """
    if a and b:
        # rapidfuzz reports 0..100; rescale to 0..1.
        return fuzz.token_set_ratio(a, b) / 100.0
    return 0.0


def performer_set_similarity(
    left_ids: Iterable[uuid.UUID],
    right_ids: Iterable[uuid.UUID],
) -> float:
    """Jaccard similarity of two sets of canonical performer UUIDs.

    None entries are ignored and ids are compared by their string form. Returns
    0.0 when both sides are empty (after dropping None), otherwise
    ``|left ∩ right| / |left ∪ right|``.
    """
    left = {str(i) for i in left_ids if i is not None}
    right = {str(i) for i in right_ids if i is not None}
    union = left | right
    # Single guard: the union is empty exactly when both sides are empty.
    if not union:
        return 0.0
    return len(left & right) / len(union)


def date_proximity(left: date | None, right: date | None, *, window_days: int = 7) -> float:
    """1.0 gdy ten sam dzień, liniowy spadek do 0 w oknie window_days, poza oknem 0.0."""
    if left is None or right is None:
        return 0.0
    delta = abs((left - right).days)
    if delta == 0:
        return 1.0
    if delta > window_days:
        return 0.0
    return 1.0 - delta / window_days


def duration_proximity(
    left: int | None, right: int | None, *, window_sec: int = 60
) -> float | None:
    """1.0 gdy duration identyczny, liniowy spadek do 0 w oknie window_sec.

    Zwraca None gdy któraś wartość brak (sygnał nieinformatywny). Tube'y rzadko
    podają dokładny duration; różnica ±60s zwykle oznacza tę samą scenę z innym
    intro/outro. Poza oknem → 0.0 (różne sceny).
    """
    if not left or not right:
        return None
    delta = abs(left - right)
    if delta == 0:
        return 1.0
    if delta > window_sec:
        return 0.0
    return 1.0 - delta / window_sec


# ---- Composite ------------------------------------------------------------

# Base weights used when all signals are available. The values sum to 1.0;
# composite_score renormalizes them over whichever signals are actually present,
# and substitutes its own table in aggregator mode.
_BASE_WEIGHTS = {
    "fp": 0.40,
    "title": 0.20,
    "performers": 0.15,
    "date": 0.15,
    "duration": 0.10,
}


def composite_score(
    *,
    fp: float | None,
    title: float | None,
    performers: float | None,
    date_score: float | None,
    duration_score: float | None = None,
    studio_match: bool | None,
    aggregator_mode: bool = False,
) -> tuple[float, dict]:
    """Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons.

    studio_match=False → hard reject na 0.0, chyba że:
      - fp ≥ 0.95 (silny fingerprint bije studio mismatch), albo
      - aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów,
        więc studio z naszej perspektywy nie jest informatywny — pomijamy hard reject
        i zwiększamy wagę performers).
    """
    reasons: dict = {}

    if studio_match is False:
        if fp is not None and fp >= 0.95:
            reasons["studio_mismatch_overridden_by_fp"] = True
        elif aggregator_mode:
            reasons["studio_ignored_aggregator"] = True
            studio_match = None  # nie informatywny
        else:
            reasons["studio_mismatch"] = True
            return 0.0, reasons

    available = {
        k: v
        for k, v in {
            "fp": fp,
            "title": title,
            "performers": performers,
            "date": date_score,
            "duration": duration_score,
        }.items()
        if v is not None
    }
    if not available:
        return 0.0, {"no_signals": True}

    base_weights = dict(_BASE_WEIGHTS)
    if aggregator_mode:
        # tube'y nie mają date/fp, performer set + duration to najsilniejsze sygnały.
        base_weights = {
            "fp": 0.20,
            "title": 0.15,
            "performers": 0.35,
            "date": 0.05,
            "duration": 0.25,
        }
        reasons["aggregator_weights"] = base_weights

    weights = {k: base_weights[k] for k in available}
    total_w = sum(weights.values())
    if total_w == 0.0:
        return 0.0, reasons
    norm_w = {k: w / total_w for k, w in weights.items()}

    score = sum(available[k] * norm_w[k] for k in available)
    reasons["weights"] = norm_w
    reasons["sub_scores"] = available

    # W aggregator mode wymagamy minimalnego performer overlap dla auto-merge —
    # bez tego polegamy tylko na title fuzzy, co ma wysoki false-positive rate
    # (różne sceny mogą mieć podobne nazwy).
    if aggregator_mode and (performers is None or performers < 0.5):
        score = min(score, 0.74)  # cap poniżej review threshold
        reasons["aggregator_low_performer_cap"] = True

    # Weak-signal cap: w aggregator mode gdy NIE MAMY duration ANI fingerprint ANI
    # date (wszystkie najsilniejsze sygnały braki), polegamy WYŁĄCZNIE na title +
    # performers. Same-performer dla prolific actresses (Tania Amazon, Mia Malkova,
    # Aria Alexander) daje 1.0, a token-set ratio 0.75 z imienia/nazwiska w tytule
    # jest powszechny → composite szybko hitje threshold (0.925) i auto-merguje
    # 78 różnych scen pod jedną canonical (zgłoszone 2026-05-08).
    #
    # Reload 2026-05-09: title bypass przy ≥0.95 zostawiał furtkę dla "Simone Peach
    # BBW rides..." vs "Peach Lollypop sexy BBW rides..." (token-set sweep
    # podbija title >0.95 mimo że to różne osoby). Cap zawsze, niezależnie od
    # title — auto-merge wymaga **co najmniej jednego strong signal** (fp,
    # duration, date). Bez nich → review queue, nigdy auto-merge.
    has_strong_signal = (
        (fp is not None and fp >= 0.5)
        or (duration_score is not None and duration_score >= 0.5)
        or (date_score is not None and date_score >= 0.5)
    )
    if aggregator_mode and not has_strong_signal:
        score = min(score, 0.85)
        reasons["aggregator_weak_signal_cap"] = True

    # Strong-signal boost: w aggregator mode duration ±3s + performer overlap ≥0.5
    # + title >=0.40 ≈ pewny match (te same długości + ten sam performer + nie-totalnie-
    # różne tytuły to bardzo rzadki false positive). Bumpujemy do auto-merge gdy
    # tube SEO title różni się od studio canonical title ale zachowuje wspólny token.
    #
    # **Tightened 2026-05-12** (bug-report ef090842): poprzednio duration ±6s bez
    # guardu na title → "Five Star Anal Fuck" (2105s) i "Match My Freak" (2110s) Lily
    # Lou auto-merge'owało się w jedną scenę bo duration diff=5 (score 0.917 ≥0.90)
    # + performer 1.0 wystarczało. Zmiany:
    #   - duration ≥ 0.95 (≤3s diff zamiast ≤6s) — Brazzers/Naughty America często
    #     mają sąsiednie sceny z tym samym actorem o pochodnej długości (intro/outro),
    #     ale ≤3s to praktycznie ten sam encoding
    #   - title ≥ 0.40 — zatrzymuje "totally different title" false matches; nadal
    #     toleruje "TheCanonicalTitle" vs "SiteSlug SEO Title TheCanonical Title FREE"
    if (
        aggregator_mode
        and duration_score is not None
        and duration_score >= 0.95
        and performers is not None
        and performers >= 0.5
        and title is not None
        and title >= 0.40
    ):
        if score < 0.92:
            reasons["duration_perf_strong_match_bump"] = True
            score = max(score, 0.92)

    return _clamp(score), reasons


def triage(score: float) -> str:
    """Classify a composite score as 'auto', 'review' or 'reject'.

    The thresholds (``auto_merge_threshold``, ``review_threshold``) are read from
    the application settings on every call.
    """
    settings = get_settings()
    verdict = "reject"
    if score >= settings.auto_merge_threshold:
        verdict = "auto"
    elif score >= settings.review_threshold:
        verdict = "review"
    return verdict


def _clamp(v: float) -> float:
    """Clamp ``v`` into [0.0, 1.0]; NaN is mapped to 0.0."""
    if math.isnan(v):
        return 0.0
    if v <= 0.0:
        return 0.0
    if v >= 1.0:
        return 1.0
    return v