Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
155 lines
5 KiB
Python
155 lines
5 KiB
Python
"""Composite scoring dla movies — title + year + studio + cast Jaccard.
|
|
|
|
Movies vs. scenes scoring:
- Movies rarely have fingerprints, so the title is the strongest signal (weight 0.5).
- Year (production year) is important but sometimes unknown (paradisehill often has
  NULL); when it is missing, its weight is redistributed proportionally.
- Studio: binary signal — match=1.0, mismatch=0.0; when either side is NULL the
  signal is skipped (it carries no weight).
- Cast: Jaccard over the performers' canonical UUIDs. Bonus signal — some sources
  (paradisehill often, dooplay always) provide a cast; tube sites have no cast → None.
|
|
|
|
Thresholds (lower than for scenes — for movies the title has a lower false-positive rate):
  ≥0.85      auto-merge
  0.65–0.85  review
  <0.65      new canonical record
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
|
|
from rapidfuzz import fuzz
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.models.movie import Movie, MoviePerformer
|
|
|
|
|
|
@dataclass
class MovieScoreBreakdown:
    """Per-signal scores and the resulting composite for one candidate comparison.

    A sub-score left at ``None`` means that signal was not available for the
    comparison.
    """

    # Title similarity, or None when it was not computed.
    title: float | None = None
    # Release-year proximity score, or None when it was not computed.
    year: float | None = None
    # True/False for studio match/mismatch, or None when it was not compared.
    studio_match: bool | None = None
    # Cast overlap score, or None when it was not computed.
    cast: float | None = None
    # Final combined score.
    composite: float = 0.0
    # Free-form diagnostics explaining how the score was produced.
    reasons: dict | None = None

    def to_dict(self) -> dict:
        """Return the breakdown as a plain dict; a missing/empty ``reasons`` becomes ``{}``."""
        payload: dict = {
            name: getattr(self, name)
            for name in ("title", "year", "studio_match", "cast", "composite")
        }
        payload["reasons"] = self.reasons if self.reasons else {}
        return payload
|
|
|
|
|
|
# Base weight of each signal in the composite score. Weights of unavailable
# signals are dropped and the remaining ones are renormalised by the scorer.
_BASE_WEIGHTS: dict[str, float] = {
    "title": 0.50,
    "year": 0.20,
    "studio": 0.20,
    "cast": 0.10,
}

# Movie-specific thresholds (lower than for scenes because movie titles are
# more unique).
AUTO_THRESHOLD: float = 0.85
REVIEW_THRESHOLD: float = 0.65
|
|
|
|
|
|
def score_movie_candidate(
    session: Session,
    *,
    candidate: Movie,
    norm_title_normalized: str,
    norm_release_year: int | None,
    norm_studio_id: uuid.UUID | None,
    norm_performer_ids: Iterable[uuid.UUID],
) -> MovieScoreBreakdown:
    """Score how well an incoming movie matches ``candidate``; all scores lie in [0, 1].

    Four optional signals are computed (title, year, studio, cast). Signals that
    cannot be computed are left as ``None`` and their base weight is
    redistributed proportionally across the signals that are available.

    Args:
        session: Session used to load the candidate's performer links; it is
            only queried when the incoming record has at least one performer.
        candidate: Existing movie the incoming record is compared against.
        norm_title_normalized: Normalized title of the incoming record.
        norm_release_year: Release year of the incoming record, if known.
        norm_studio_id: Studio id of the incoming record, if known.
        norm_performer_ids: Performer ids of the incoming record; ``None``
            entries are ignored.

    Returns:
        A ``MovieScoreBreakdown`` with the sub-scores, the composite score and a
        ``reasons`` dict holding the applied weights and sub-scores (or the
        ``no_signals`` / ``studio_year_mismatch_reject`` markers).
    """
    result = MovieScoreBreakdown(reasons={})

    # Title: fuzzy token-set similarity, only when both titles are non-empty.
    existing_title = candidate.title_normalized
    if existing_title and norm_title_normalized:
        ratio = fuzz.token_set_ratio(existing_title, norm_title_normalized)
        result.title = ratio / 100.0

    # Year: exact = 1.0, off by one = 0.7, off by two = 0.4, anything else = 0.0.
    existing_year = candidate.release_year
    if existing_year is not None and norm_release_year is not None:
        gap = abs(existing_year - norm_release_year)
        result.year = {0: 1.0, 1: 0.7, 2: 0.4}.get(gap, 0.0)

    # Studio: binary signal, compared only when both sides carry a studio.
    if candidate.studio_id and norm_studio_id:
        result.studio_match = candidate.studio_id == norm_studio_id

    # Cast: Jaccard index over performer ids, only when both sides have performers.
    incoming_cast = {str(pid) for pid in norm_performer_ids if pid is not None}
    if incoming_cast:
        links_query = MoviePerformer.__table__.select().where(
            MoviePerformer.movie_id == candidate.id
        )
        link_rows = session.execute(links_query).fetchall()
        existing_cast = {str(row.performer_id) for row in link_rows}
        if existing_cast:
            shared = incoming_cast & existing_cast
            combined = incoming_cast | existing_cast
            # Both sets are non-empty here, so the union is never empty.
            result.cast = len(shared) / len(combined)

    # Collect the signals that were actually computed, in fixed order.
    if result.studio_match is None:
        studio_score = None
    else:
        studio_score = 1.0 if result.studio_match else 0.0
    signal_values = (
        ("title", result.title),
        ("year", result.year),
        ("studio", studio_score),
        ("cast", result.cast),
    )
    available: dict[str, float] = {
        name: value for name, value in signal_values if value is not None
    }

    if not available:
        result.composite = 0.0
        result.reasons["no_signals"] = True
        return result

    # Renormalise the base weights over the available signals only.
    weights = {name: _BASE_WEIGHTS[name] for name in available}
    total_w = sum(weights.values())
    if total_w == 0.0:
        result.composite = 0.0
        return result
    norm_w = {name: weight / total_w for name, weight in weights.items()}
    score = sum(value * norm_w[name] for name, value in available.items())

    # Hard reject: a studio mismatch combined with a year gap of three or more
    # means these are different releases that merely share a similar title,
    # not mirrors of the same movie.
    years_far_apart = result.year is not None and result.year < 0.4
    if result.studio_match is False and years_far_apart:
        result.reasons["studio_year_mismatch_reject"] = True
        score = 0.0

    result.composite = max(0.0, min(1.0, score))
    result.reasons["weights"] = norm_w
    result.reasons["sub_scores"] = available
    return result
|
|
|
|
|
|
def triage_movie(score: float) -> str:
    """Map a composite score to a triage decision.

    Returns ``"auto"`` when the score reaches ``AUTO_THRESHOLD``, ``"review"``
    when it reaches ``REVIEW_THRESHOLD``, and ``"reject"`` otherwise.
    """
    # Tiers are checked from the highest floor down; the first one met wins.
    tiers = (
        (AUTO_THRESHOLD, "auto"),
        (REVIEW_THRESHOLD, "review"),
    )
    for floor, verdict in tiers:
        if score >= floor:
            return verdict
    return "reject"
|