# goon/app/resolve/scoring.py

"""Scoring funkcji do dopasowania kandydatów scen.

Wszystkie sub-score'y wracają w [0, 1]. Composite score łączy je z wagami,
redystrybuując wagę gdy któryś sygnał jest niedostępny (np. brak fingerprintu).

Wagi (gdy wszystko dostępne):
  fp_phash:    0.40
  title:       0.25
  performers:  0.20
  date:        0.15

Twardy reject: studio_match=False → score 0.0 (chyba że ma silny fingerprint match
≥0.95, wtedy ufamy fingerprintowi i ignorujemy studio mismatch — bo zdarza się że
TPDB ma "Brazzers Exxtra" a StashDB "Brazzers" jako studio sceny).
"""
from __future__ import annotations

import math
import re
import uuid
from collections.abc import Iterable
from dataclasses import dataclass, field
from datetime import date

from rapidfuzz import fuzz

from app.config import get_settings


@dataclass
class ScoreBreakdown:
    """Per-sub-score values + final composite + reasons (do zapisu w merge_candidates.reasons)."""

    fp: float | None = None
    title: float | None = None
    performers: float | None = None
    date: float | None = None
    duration: float | None = None
    studio_match: bool | None = None
    composite: float = 0.0
    reasons: dict = None  # type: ignore[assignment]

    def to_dict(self) -> dict:
        return {
            "fp": self.fp,
            "title": self.title,
            "performers": self.performers,
            "date": self.date,
            "duration": self.duration,
            "studio_match": self.studio_match,
            "composite": self.composite,
            "reasons": self.reasons or {},
        }


# ---- Sub-scorers ----------------------------------------------------------

def hamming_distance_hex(a: str, b: str) -> int:
    """Return the number of differing bits between two equal-length hex hashes.

    Raises:
        ValueError: when the two strings differ in length (``int(..., 16)``
            also raises ValueError for input that is not valid hexadecimal).
    """
    if len(a) == len(b):
        # Each set bit in the XOR marks one position where the hashes disagree.
        differing = int(a, 16) ^ int(b, 16)
        return bin(differing).count("1")
    raise ValueError(f"hash length mismatch: {len(a)} vs {len(b)}")


def phash_similarity(a: str, b: str, *, bits: int = 64) -> float:
    """Return ``1 - hamming / bits``, floored at 0.0.

    For a 64-bit phash, up to 5 differing bits still gives a similarity ≥ 0.92.
    """
    distance = hamming_distance_hex(a, b)
    similarity = 1.0 - distance / bits
    # A distance larger than `bits` would go negative; floor it at zero.
    return similarity if similarity > 0.0 else 0.0


def title_similarity(a: str, b: str) -> float:
    """Fuzzy similarity of two titles, scaled to [0, 1].

    `a` and `b` are expected to be already normalized (`title_normalized`).
    Token-set ratio is used because it tolerates reordered words and extra
    tokens. An empty title on either side yields 0.0.
    """
    if a and b:
        # rapidfuzz reports the ratio on a 0-100 scale.
        return fuzz.token_set_ratio(a, b) / 100.0
    return 0.0


def performer_set_similarity(
    left_ids: Iterable[uuid.UUID],
    right_ids: Iterable[uuid.UUID],
) -> float:
    """Jaccard similarity of two sets of canonical performer UUIDs.

    ``None`` entries are ignored and ids are compared by their string form.
    Returns 0.0 when neither side contributes any id, otherwise
    ``|intersection| / |union|``.
    """
    left = {str(i) for i in left_ids if i is not None}
    right = {str(i) for i in right_ids if i is not None}
    union = left | right
    # An empty union means both sides were empty; this single guard also
    # protects the division below.
    if not union:
        return 0.0
    return len(left & right) / len(union)


def date_proximity(left: date | None, right: date | None, *, window_days: int = 7) -> float:
    """1.0 gdy ten sam dzień, liniowy spadek do 0 w oknie window_days, poza oknem 0.0."""
    if left is None or right is None:
        return 0.0
    delta = abs((left - right).days)
    if delta == 0:
        return 1.0
    if delta > window_days:
        return 0.0
    return 1.0 - delta / window_days


# Matches "Episode 4" / "Ep 4" / "Part 2" / "Pt. 3" / "Vol 7" / "Volume 12" /
# "Scene 5" / "Chapter 9" / "Ch.3" / "#7" / "S9:E8" / "S9E8" — all as they look
# after normalization (lower-cased, the dot usually stripped, but \\. is tolerated).
# `(?<!\d)` + `(?!\d)` keep a capture from grabbing a fragment of a longer digit
# run — e.g. "vol 2020" produces no match instead of a truncated number.
# Three alternatives: keyword + number, bare "#" + number, and season/episode
# ("s<digits>" followed by ":" or "e" and the episode number, two groups).
_SERIES_NUM_RE = re.compile(
    r"\b(?:episode|ep|part|pt|vol|volume|chapter|ch|scene|series)\b\s*\.?\s*#?\s*(?<!\d)(\d{1,3})(?!\d)"
    r"|(?<!\w)#\s*(?<!\d)(\d{1,3})(?!\d)"
    r"|\bs(?<!\d)(\d{1,2})(?!\d)\s*[:e]\s*e?(?<!\d)(\d{1,3})(?!\d)",
    re.IGNORECASE,
)

# Tags that explicitly mark a scene as a separate variant (BTS / bonus /
# unedited / trailer). When a tag appears on ONLY one side, the two titles do
# NOT describe the same scene. All entries are lower-case; spelling variants
# (with/without hyphen or apostrophe) are listed separately.
_MODIFIER_TAGS: tuple[str, ...] = (
    "behind the scenes",
    "behind-the-scenes",
    "bts",
    "bonus",
    "unedited",
    "uncut",
    "extended",
    "directors cut",
    "director's cut",
    "trailer",
    "preview",
    "teaser",
    "compilation",
)


def detect_series_positions(title_normalized: str | None) -> set[int]:
    """Zwraca wszystkie pozycje (Episode/Part/Vol/Scene/Chapter/# itp.) znalezione w tytule.

    Tytuł powinien być znormalizowany (lowercase, unaccent), ale regex jest case-insensitive
    i tolerancyjny — chodzi tylko o sygnał, nie o robust parsing.
    """
    if not title_normalized:
        return set()
    out: set[int] = set()
    for m in _SERIES_NUM_RE.finditer(title_normalized):
        for g in m.groups():
            if g and g.isdigit():
                out.add(int(g))
    return out


def detect_modifier_tags(title_normalized: str | None) -> set[str]:
    """Zwraca set modifier tagów wykrytych w tytule (bts/bonus/unedited/itp.)."""
    if not title_normalized:
        return set()
    lower = title_normalized.lower()
    return {t for t in _MODIFIER_TAGS if t in lower}


def series_mismatch_strength(
    title_a_normalized: str | None,
    title_b_normalized: str | None,
) -> float:
    """Detect a "scene variant" divergence between two titles.

    Returns a strength in [0.0, 1.0]:
      0.0 — no mismatch signal (titles are compatible).
      0.5 — modifier tags on both sides but DIFFERENT ones (BTS vs trailer).
      0.7 — modifier tag on one side only (BTS vs regular).
      1.0 — series position mismatch (Episode 2 vs Episode 4 → hard reject).
    """
    positions_a = detect_series_positions(title_a_normalized)
    positions_b = detect_series_positions(title_b_normalized)
    # Hard mismatch when both sides carry positions and the sets are not equal —
    # e.g. "Vol 140 Scene 3" vs "Vol 140 Scene 4" share 140 but differ in 3/4,
    # so they are separate scenes of one compilation. A one-sided absence (only
    # one title has any position) is not a mismatch — tube SEO titles often
    # drop the number.
    if positions_a and positions_b and positions_a != positions_b:
        return 1.0

    tags_a = detect_modifier_tags(title_a_normalized)
    tags_b = detect_modifier_tags(title_b_normalized)
    if bool(tags_a) != bool(tags_b):
        # Exactly one side carries a modifier tag.
        return 0.7
    if tags_a and tags_b and tags_a.isdisjoint(tags_b):
        # Both sides are tagged, but with nothing in common.
        return 0.5
    return 0.0


def duration_proximity(
    left: int | None, right: int | None, *, window_sec: int = 60
) -> float | None:
    """1.0 gdy duration identyczny, liniowy spadek do 0 w oknie window_sec.

    Zwraca None gdy któraś wartość brak (sygnał nieinformatywny). Tube'y rzadko
    podają dokładny duration; różnica ±60s zwykle oznacza tę samą scenę z innym
    intro/outro. Poza oknem → 0.0 (różne sceny).
    """
    if not left or not right:
        return None
    delta = abs(left - right)
    if delta == 0:
        return 1.0
    if delta > window_sec:
        return 0.0
    return 1.0 - delta / window_sec


# ---- Composite ------------------------------------------------------------

# Base weights used when all signals are available (they sum to 1.0).
# composite_score() renormalizes over whichever signals are actually present.
_BASE_WEIGHTS = {
    "fp": 0.40,
    "title": 0.20,
    "performers": 0.15,
    "date": 0.15,
    "duration": 0.10,
}


def composite_score(
    *,
    fp: float | None,
    title: float | None,
    performers: float | None,
    date_score: float | None,
    duration_score: float | None = None,
    studio_match: bool | None,
    aggregator_mode: bool = False,
    series_mismatch: float | None = None,
) -> tuple[float, dict]:
    """Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons.

    studio_match=False → hard reject na 0.0, chyba że:
      - fp ≥ 0.95 (silny fingerprint bije studio mismatch), albo
      - aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów,
        więc studio z naszej perspektywy nie jest informatywny — pomijamy hard reject
        i zwiększamy wagę performers).

    `series_mismatch` (≥0.0): wartość z `series_mismatch_strength()` — gdy 1.0 (Episode 2
    vs Episode 4), wymusza twardy reject niezależnie od pozostałych sygnałów; gdy 0.5-0.7
    (modifier mismatch: BTS/bonus/unedited po jednej stronie), nakłada cap = `1 - strength`.
    """
    reasons: dict = {}

    if series_mismatch is not None and series_mismatch >= 1.0:
        reasons["series_position_mismatch"] = True
        return 0.0, reasons

    if studio_match is False:
        if fp is not None and fp >= 0.95:
            reasons["studio_mismatch_overridden_by_fp"] = True
        elif aggregator_mode:
            reasons["studio_ignored_aggregator"] = True
            studio_match = None  # nie informatywny
        else:
            reasons["studio_mismatch"] = True
            return 0.0, reasons

    available = {
        k: v
        for k, v in {
            "fp": fp,
            "title": title,
            "performers": performers,
            "date": date_score,
            "duration": duration_score,
        }.items()
        if v is not None
    }
    if not available:
        return 0.0, {"no_signals": True}

    base_weights = dict(_BASE_WEIGHTS)
    if aggregator_mode:
        # tube'y nie mają date/fp, performer set + duration to najsilniejsze sygnały.
        base_weights = {
            "fp": 0.20,
            "title": 0.15,
            "performers": 0.35,
            "date": 0.05,
            "duration": 0.25,
        }
        reasons["aggregator_weights"] = base_weights

    weights = {k: base_weights[k] for k in available}
    total_w = sum(weights.values())
    if total_w == 0.0:
        return 0.0, reasons
    norm_w = {k: w / total_w for k, w in weights.items()}

    score = sum(available[k] * norm_w[k] for k in available)
    reasons["weights"] = norm_w
    reasons["sub_scores"] = available

    # W aggregator mode wymagamy minimalnego performer overlap dla auto-merge —
    # bez tego polegamy tylko na title fuzzy, co ma wysoki false-positive rate
    # (różne sceny mogą mieć podobne nazwy).
    if aggregator_mode and (performers is None or performers < 0.5):
        score = min(score, 0.74)  # cap poniżej review threshold
        reasons["aggregator_low_performer_cap"] = True

    # Weak-signal cap: w aggregator mode gdy NIE MAMY duration ANI fingerprint ANI
    # date (wszystkie najsilniejsze sygnały braki), polegamy WYŁĄCZNIE na title +
    # performers. Same-performer dla prolific actresses (Tania Amazon, Mia Malkova,
    # Aria Alexander) daje 1.0, a token-set ratio 0.75 z imienia/nazwiska w tytule
    # jest powszechny → composite szybko hitje threshold (0.925) i auto-merguje
    # 78 różnych scen pod jedną canonical (zgłoszone 2026-05-08).
    #
    # Reload 2026-05-09: title bypass przy ≥0.95 zostawiał furtkę dla "Simone Peach
    # BBW rides..." vs "Peach Lollypop sexy BBW rides..." (token-set sweep
    # podbija title >0.95 mimo że to różne osoby). Cap zawsze, niezależnie od
    # title — auto-merge wymaga **co najmniej jednego strong signal** (fp,
    # duration, date). Bez nich → review queue, nigdy auto-merge.
    has_strong_signal = (
        (fp is not None and fp >= 0.5)
        or (duration_score is not None and duration_score >= 0.5)
        or (date_score is not None and date_score >= 0.5)
    )
    if aggregator_mode and not has_strong_signal:
        score = min(score, 0.85)
        reasons["aggregator_weak_signal_cap"] = True

    # Strong-signal boost: w aggregator mode duration ±3s + performer overlap ≥0.5
    # + title >=0.40 ≈ pewny match (te same długości + ten sam performer + nie-totalnie-
    # różne tytuły to bardzo rzadki false positive). Bumpujemy do auto-merge gdy
    # tube SEO title różni się od studio canonical title ale zachowuje wspólny token.
    #
    # **Tightened 2026-05-12** (bug-report ef090842): poprzednio duration ±6s bez
    # guardu na title → "Five Star Anal Fuck" (2105s) i "Match My Freak" (2110s) Lily
    # Lou auto-merge'owało się w jedną scenę bo duration diff=5 (score 0.917 ≥0.90)
    # + performer 1.0 wystarczało. Zmiany:
    #   - duration ≥ 0.95 (≤3s diff zamiast ≤6s) — Brazzers/Naughty America często
    #     mają sąsiednie sceny z tym samym actorem o pochodnej długości (intro/outro),
    #     ale ≤3s to praktycznie ten sam encoding
    #   - title ≥ 0.40 — zatrzymuje "totally different title" false matches; nadal
    #     toleruje "TheCanonicalTitle" vs "SiteSlug SEO Title TheCanonical Title FREE"
    if (
        aggregator_mode
        and duration_score is not None
        and duration_score >= 0.95
        and performers is not None
        and performers >= 0.5
        and title is not None
        and title >= 0.40
    ):
        if score < 0.92:
            reasons["duration_perf_strong_match_bump"] = True
            score = max(score, 0.92)

    # Series-modifier cap: jedna ze stron ma "BTS"/"bonus"/"unedited" a druga nie,
    # albo różne tagi. Twardy mismatch (różne pozycje numeryczne) został już złapany
    # wcześniej (return 0.0). Tu zostają miękkie sygnały — cap żeby nigdy nie auto-merge.
    if series_mismatch is not None and 0.0 < series_mismatch < 1.0:
        cap = max(0.0, 1.0 - series_mismatch)
        if score > cap:
            reasons["series_modifier_cap"] = cap
            reasons["series_mismatch_strength"] = series_mismatch
            score = cap

    return _clamp(score), reasons


def triage(score: float) -> str:
    """Map a score to 'auto', 'review' or 'reject' using the thresholds from settings."""
    settings = get_settings()
    if score >= settings.auto_merge_threshold:
        verdict = "auto"
    elif score >= settings.review_threshold:
        verdict = "review"
    else:
        verdict = "reject"
    return verdict


def _clamp(v: float) -> float:
    return max(0.0, min(1.0, v if not math.isnan(v) else 0.0))