"""Composite scoring dla movies — title + year + studio + cast Jaccard. Movies vs Scenes scoring: - Movies rzadko mają fingerprinty — tytuł jest najsilniejszym sygnałem (weight 0.5). - Year (production year) ważny ale czasem nieznany (paradisehill często NULL); wagę redystrybuujemy proporcjonalnie gdy brak. - Studio: binary signal — match=1.0, mismatch=0.0; gdy któraś strona ma NULL, sygnał pomijamy (None waga). - Cast: Jaccard po canonical UUID-ach performerów. Bonus signal — niektóre source'y (paradisehill często, dooplay zawsze) mają cast; tube'y bez cast → None. Threshold (lower niż scenes — title dla movies ma niższy false-positive rate): ≥0.85 auto-merge 0.65-0.85 review <0.65 nowy kanon """ from __future__ import annotations import uuid from collections.abc import Iterable from dataclasses import dataclass from rapidfuzz import fuzz from sqlalchemy.orm import Session from app.models.movie import Movie, MoviePerformer @dataclass class MovieScoreBreakdown: title: float | None = None year: float | None = None studio_match: bool | None = None cast: float | None = None composite: float = 0.0 reasons: dict | None = None def to_dict(self) -> dict: return { "title": self.title, "year": self.year, "studio_match": self.studio_match, "cast": self.cast, "composite": self.composite, "reasons": self.reasons or {}, } _BASE_WEIGHTS = { "title": 0.50, "year": 0.20, "studio": 0.20, "cast": 0.10, } # Movies-specific thresholds (lower niż scenes ze względu na unique titles) AUTO_THRESHOLD = 0.85 REVIEW_THRESHOLD = 0.65 def score_movie_candidate( session: Session, *, candidate: Movie, norm_title_normalized: str, norm_release_year: int | None, norm_studio_id: uuid.UUID | None, norm_performer_ids: Iterable[uuid.UUID], ) -> MovieScoreBreakdown: """Liczy score dopasowania incoming → candidate. Wszystko w [0,1].""" breakdown = MovieScoreBreakdown(reasons={}) # Title (always) if candidate.title_normalized and norm_title_normalized: breakdown.title = fuzz.token_set_ratio( candidate.title_normalized, norm_title_normalized ) / 100.0 # Year — exact lub ±1 = wysoka, ±2 = średnia if candidate.release_year is not None and norm_release_year is not None: delta = abs(candidate.release_year - norm_release_year) if delta == 0: breakdown.year = 1.0 elif delta == 1: breakdown.year = 0.7 elif delta == 2: breakdown.year = 0.4 else: breakdown.year = 0.0 # Studio match — binary if candidate.studio_id and norm_studio_id: breakdown.studio_match = candidate.studio_id == norm_studio_id # Cast Jaccard — gdy obie strony mają performerów norm_perf_set = {str(pid) for pid in norm_performer_ids if pid is not None} if norm_perf_set: cand_perf_rows = session.execute( MoviePerformer.__table__.select().where( MoviePerformer.movie_id == candidate.id ) ).fetchall() cand_perf_set = {str(r.performer_id) for r in cand_perf_rows} if cand_perf_set: inter = norm_perf_set & cand_perf_set union = norm_perf_set | cand_perf_set breakdown.cast = len(inter) / len(union) if union else 0.0 # Composite — redystrybuuj wagi po dostępnych sygnałach available: dict[str, float] = {} if breakdown.title is not None: available["title"] = breakdown.title if breakdown.year is not None: available["year"] = breakdown.year if breakdown.studio_match is not None: available["studio"] = 1.0 if breakdown.studio_match else 0.0 if breakdown.cast is not None: available["cast"] = breakdown.cast if not available: breakdown.composite = 0.0 breakdown.reasons["no_signals"] = True return breakdown weights = {k: _BASE_WEIGHTS[k] for k in available} total_w = sum(weights.values()) if total_w == 0.0: breakdown.composite = 0.0 return breakdown norm_w = {k: w / total_w for k, w in weights.items()} score = sum(available[k] * norm_w[k] for k in available) # Hard reject: studio mismatch + year mismatch (delta ≥3) → różne filmy z podobnym # tytułem. "Tushy Anal Bliss 7" vs "Anal Bliss 7" z innego studia rok różny — to NIE # jest mirror, to różne wydania. if ( breakdown.studio_match is False and breakdown.year is not None and breakdown.year < 0.4 ): breakdown.reasons["studio_year_mismatch_reject"] = True score = 0.0 breakdown.composite = max(0.0, min(1.0, score)) breakdown.reasons["weights"] = norm_w breakdown.reasons["sub_scores"] = available return breakdown def triage_movie(score: float) -> str: if score >= AUTO_THRESHOLD: return "auto" if score >= REVIEW_THRESHOLD: return "review" return "reject"