goon/app/resolve/movie_score.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

155 lines
5 KiB
Python

"""Composite scoring dla movies — title + year + studio + cast Jaccard.
Movie scoring vs. scene scoring:
- Movies rarely have fingerprints, so the title is the strongest signal (weight 0.5).
- Year (production year) matters but is sometimes unknown (paradisehill often has NULL);
  when it is missing, its weight is redistributed proportionally across the other signals.
- Studio: binary signal — match=1.0, mismatch=0.0; when either side is NULL the signal
  is skipped (it carries no weight).
- Cast: Jaccard over the performers' canonical UUIDs. A bonus signal — some sources
  (paradisehill often, dooplay always) provide a cast; tube sites have no cast → None.
Thresholds (lower than for scenes — for movies the title has a lower false-positive rate):
  ≥0.85      auto-merge
  0.65-0.85  review
  <0.65      new canonical entry
"""
from __future__ import annotations
import uuid
from collections.abc import Iterable
from dataclasses import dataclass
from rapidfuzz import fuzz
from sqlalchemy.orm import Session
from app.models.movie import Movie, MoviePerformer
@dataclass
class MovieScoreBreakdown:
    """Per-signal scores plus the blended result of one movie match attempt.

    A sub-score left at ``None`` means that signal was not computed.
    """

    # Individual signals; None = signal not available.
    title: float | None = None
    year: float | None = None
    studio_match: bool | None = None
    cast: float | None = None
    # Blended score over whichever signals were available.
    composite: float = 0.0
    # Free-form diagnostics attached by the scorer.
    reasons: dict | None = None

    def to_dict(self) -> dict:
        """Return the breakdown as a plain dict.

        A missing or empty ``reasons`` is reported as ``{}``.
        """
        scalar_fields = ("title", "year", "studio_match", "cast", "composite")
        payload: dict = {name: getattr(self, name) for name in scalar_fields}
        payload["reasons"] = self.reasons if self.reasons else {}
        return payload
# Baseline weight of each signal in the composite score. The scorer renormalises
# these over the signals that are actually available for a given pair.
_BASE_WEIGHTS: dict[str, float] = dict(
    title=0.50,
    year=0.20,
    studio=0.20,
    cast=0.10,
)
# Movie-specific thresholds (lower than for scenes because movie titles are unique).
REVIEW_THRESHOLD: float = 0.65
AUTO_THRESHOLD: float = 0.85
def score_movie_candidate(
    session: Session,
    *,
    candidate: Movie,
    norm_title_normalized: str,
    norm_release_year: int | None,
    norm_studio_id: uuid.UUID | None,
    norm_performer_ids: Iterable[uuid.UUID],
) -> MovieScoreBreakdown:
    """Score how well an incoming movie record matches ``candidate``.

    Up to four signals are computed (title, year, studio, cast). A signal
    that cannot be computed because data is missing on either side is
    left out, and the base weights of the remaining signals are
    renormalised so they sum to 1.

    Args:
        session: Database session; only queried when the incoming record
            carries at least one performer id.
        candidate: Existing movie the incoming record is compared against.
        norm_title_normalized: Normalised title of the incoming record.
        norm_release_year: Release year of the incoming record, if known.
        norm_studio_id: Studio id of the incoming record, if known.
        norm_performer_ids: Performer ids of the incoming record; ``None``
            entries are ignored.

    Returns:
        A ``MovieScoreBreakdown`` whose ``composite`` lies in [0, 1] and
        whose ``reasons`` records the weights and sub-scores that were used.
    """
    result = MovieScoreBreakdown(reasons={})

    # Title similarity; token_set_ratio is divided by 100 to land in [0, 1].
    cand_title = candidate.title_normalized
    if cand_title and norm_title_normalized:
        result.title = fuzz.token_set_ratio(cand_title, norm_title_normalized) / 100.0

    # Year: exact match scores highest, +/-1 high, +/-2 medium, anything
    # further apart scores zero.
    cand_year = candidate.release_year
    if cand_year is not None and norm_release_year is not None:
        year_gap = abs(cand_year - norm_release_year)
        result.year = {0: 1.0, 1: 0.7, 2: 0.4}.get(year_gap, 0.0)

    # Studio: binary signal, only when both sides know their studio.
    cand_studio = candidate.studio_id
    if cand_studio and norm_studio_id:
        result.studio_match = cand_studio == norm_studio_id

    # Cast: Jaccard index over performer ids, only when both sides have any.
    incoming_cast = {str(pid) for pid in norm_performer_ids if pid is not None}
    if incoming_cast:
        cast_query = MoviePerformer.__table__.select().where(
            MoviePerformer.movie_id == candidate.id
        )
        cast_rows = session.execute(cast_query).fetchall()
        known_cast = {str(row.performer_id) for row in cast_rows}
        if known_cast:
            shared = incoming_cast & known_cast
            combined = incoming_cast | known_cast
            # `combined` cannot be empty here: both operands are non-empty.
            result.cast = len(shared) / len(combined)

    # Collect the signals that were computed, in a fixed order so the
    # floating-point summation below is deterministic.
    if result.studio_match is None:
        studio_value = None
    else:
        studio_value = 1.0 if result.studio_match else 0.0
    ordered_signals = (
        ("title", result.title),
        ("year", result.year),
        ("studio", studio_value),
        ("cast", result.cast),
    )
    sub_scores: dict[str, float] = {
        name: value for name, value in ordered_signals if value is not None
    }

    if not sub_scores:
        result.composite = 0.0
        result.reasons["no_signals"] = True
        return result

    # Redistribute the base weights over the available signals only.
    weight_sum = sum(_BASE_WEIGHTS[name] for name in sub_scores)
    if weight_sum == 0.0:
        result.composite = 0.0
        return result
    shares = {name: _BASE_WEIGHTS[name] / weight_sum for name in sub_scores}
    score = sum(value * shares[name] for name, value in sub_scores.items())

    # Hard reject: a studio mismatch together with a year gap of 3 or more
    # means different movies with a similar title. E.g. "Tushy Anal Bliss 7"
    # vs "Anal Bliss 7" from another studio and a different year is NOT a
    # mirror; these are different releases.
    year_far_off = result.year is not None and result.year < 0.4
    if result.studio_match is False and year_far_off:
        result.reasons["studio_year_mismatch_reject"] = True
        score = 0.0

    result.composite = max(0.0, min(1.0, score))
    result.reasons["weights"] = shares
    result.reasons["sub_scores"] = sub_scores
    return result
def triage_movie(score: float) -> str:
    """Map a composite score to a triage decision.

    Returns ``"auto"`` when the score reaches ``AUTO_THRESHOLD``,
    ``"review"`` when it reaches ``REVIEW_THRESHOLD``, otherwise ``"reject"``.
    """
    # Checked from the strictest threshold down; the first one reached wins.
    decision_floors = (
        ("auto", AUTO_THRESHOLD),
        ("review", REVIEW_THRESHOLD),
    )
    for decision, floor in decision_floors:
        if score >= floor:
            return decision
    return "reject"