Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
274 lines
9.5 KiB
Python
274 lines
9.5 KiB
Python
"""Scoring funkcji do dopasowania kandydatów scen.
|
|
|
|
Wszystkie sub-score'y wracają w [0, 1]. Composite score łączy je z wagami,
|
|
redystrybuując wagę gdy któryś sygnał jest niedostępny (np. brak fingerprintu).
|
|
|
|
Weights (when every signal is available; mirrors `_BASE_WEIGHTS`):
    fp_phash:   0.40
    title:      0.20
    performers: 0.15
    date:       0.15
    duration:   0.10
|
|
|
|
Twardy reject: studio_match=False → score 0.0 (chyba że ma silny fingerprint match
|
|
≥0.95, wtedy ufamy fingerprintowi i ignorujemy studio mismatch — bo zdarza się że
|
|
TPDB ma "Brazzers Exxtra" a StashDB "Brazzers" jako studio sceny).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import uuid
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
from datetime import date
|
|
|
|
from rapidfuzz import fuzz
|
|
|
|
from app.config import get_settings
|
|
|
|
|
|
@dataclass
class ScoreBreakdown:
    """Per-sub-score values, the final composite and the reasons report.

    Every sub-score is optional: ``None`` means the signal was not available.
    ``reasons`` is intended to be persisted in ``merge_candidates.reasons``.
    """

    fp: float | None = None
    title: float | None = None
    performers: float | None = None
    date: float | None = None
    duration: float | None = None
    studio_match: bool | None = None
    composite: float = 0.0
    # ``None`` means "nothing recorded"; ``to_dict`` normalises it to ``{}``.
    # Annotated as optional so the declared type matches the default value.
    reasons: dict | None = None

    def to_dict(self) -> dict:
        """Return a plain-dict view of the breakdown.

        ``reasons`` is always emitted as a dict: a missing (or empty) value
        becomes ``{}``.
        """
        return {
            "fp": self.fp,
            "title": self.title,
            "performers": self.performers,
            "date": self.date,
            "duration": self.duration,
            "studio_match": self.studio_match,
            "composite": self.composite,
            "reasons": self.reasons or {},
        }
|
|
|
|
|
|
# ---- Sub-scorers ----------------------------------------------------------
|
|
|
|
def hamming_distance_hex(a: str, b: str) -> int:
    """Count the bits that differ between two hex-encoded hashes.

    Both strings must have the same length; each is parsed as a base-16
    integer and the set bits of their XOR are counted.

    Raises:
        ValueError: if the lengths differ (``int`` also raises ``ValueError``
            when either string is not valid hexadecimal).
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise ValueError(f"hash length mismatch: {len_a} vs {len_b}")
    differing_bits = int(a, 16) ^ int(b, 16)
    # Binary rendering without the "0b" prefix; every "1" is one differing bit.
    return format(differing_bits, "b").count("1")
|
|
|
|
|
|
def phash_similarity(a: str, b: str, *, bits: int = 64) -> float:
    """Return ``1 - hamming / bits``, floored at 0.0.

    With the default 64-bit width, up to 5 differing bits still yields a
    similarity of at least 0.92 (1 - 5/64 = 0.921875).

    Errors from ``hamming_distance_hex`` (length mismatch, invalid hex)
    propagate unchanged.
    """
    differing_fraction = hamming_distance_hex(a, b) / bits
    similarity = 1.0 - differing_fraction
    # More differing bits than ``bits`` would go negative; floor at zero.
    return similarity if similarity > 0.0 else 0.0
|
|
|
|
|
|
def title_similarity(a: str, b: str) -> float:
    """Fuzzy title similarity in [0, 1].

    ``a`` and ``b`` are expected to be normalised already
    (``title_normalized``). The token-set ratio tolerates reordered words and
    extra tokens. If either title is empty the result is 0.0.
    """
    if a and b:
        # rapidfuzz reports 0-100; rescale to 0-1.
        return fuzz.token_set_ratio(a, b) / 100.0
    return 0.0
|
|
|
|
|
|
def performer_set_similarity(
    left_ids: Iterable[uuid.UUID | None],
    right_ids: Iterable[uuid.UUID | None],
) -> float:
    """Jaccard index over two collections of canonical performer UUIDs.

    ``None`` entries are ignored and ids are compared by their string form.

    Returns:
        ``|left ∩ right| / |left ∪ right|``, or 0.0 when neither side has any
        (non-None) id.
    """
    left = {str(i) for i in left_ids if i is not None}
    right = {str(i) for i in right_ids if i is not None}
    union = left | right
    # The union is empty exactly when both sides are empty, so this single
    # guard covers the "no performers at all" case and the division by zero.
    if not union:
        return 0.0
    return len(left & right) / len(union)
|
|
|
|
|
|
def date_proximity(left: date | None, right: date | None, *, window_days: int = 7) -> float:
    """Score how close two dates are, in [0, 1].

    Returns 1.0 for the same day, falls off linearly to 0.0 at
    ``window_days`` apart, and is 0.0 beyond the window. A missing date on
    either side also yields 0.0.
    """
    if left is None or right is None:
        return 0.0
    gap_days = abs((left - right).days)
    if gap_days == 0:
        return 1.0
    return 0.0 if gap_days > window_days else 1.0 - gap_days / window_days
|
|
|
|
|
|
def duration_proximity(
    left: int | None, right: int | None, *, window_sec: int = 60
) -> float | None:
    """Score how close two durations (in seconds) are.

    Returns 1.0 for identical durations, falls off linearly to 0.0 at
    ``window_sec`` apart, and is 0.0 beyond the window (treated as different
    scenes).

    Returns ``None`` when either value is missing (``None`` or 0), i.e. the
    signal is uninformative. Rationale from the original notes: tube sites
    rarely report an exact duration, and a difference within ±60 s usually
    means the same scene with a different intro/outro.
    """
    if not (left and right):
        return None
    gap_sec = abs(left - right)
    if gap_sec == 0:
        return 1.0
    return 0.0 if gap_sec > window_sec else 1.0 - gap_sec / window_sec
|
|
|
|
|
|
# ---- Composite ------------------------------------------------------------
|
|
|
|
# Baseline weights used when every signal is available. Insertion order
# (fp, title, performers, date, duration) is kept as in the original table.
_BASE_WEIGHTS: dict[str, float] = dict(
    fp=0.40,
    title=0.20,
    performers=0.15,
    date=0.15,
    duration=0.10,
)
|
|
|
|
|
|
def composite_score(
    *,
    fp: float | None,
    title: float | None,
    performers: float | None,
    date_score: float | None,
    duration_score: float | None = None,
    studio_match: bool | None,
    aggregator_mode: bool = False,
) -> tuple[float, dict]:
    """Blend the sub-scores into one composite in [0, 1] plus a reasons report.

    Sub-scores passed as ``None`` are treated as unavailable; the weights of
    the remaining signals are renormalised so they sum to 1.

    ``studio_match=False`` is a hard reject (score 0.0) unless:
      - ``fp >= 0.95`` (a strong fingerprint beats a studio mismatch), or
      - ``aggregator_mode=True`` (tube sites such as HQPorner aggregate scenes
        from many studios, so the studio is not informative from our side;
        the hard reject is skipped and a weight table favouring performers
        and duration is used instead).

    Returns:
        ``(score, reasons)`` where ``reasons`` records which rules fired, the
        normalised weights and the sub-scores that were used.
    """
    reasons: dict = {}

    # ---- Studio gate -------------------------------------------------------
    if studio_match is False:
        fingerprint_overrides = fp is not None and fp >= 0.95
        if fingerprint_overrides:
            reasons["studio_mismatch_overridden_by_fp"] = True
        elif aggregator_mode:
            reasons["studio_ignored_aggregator"] = True
            studio_match = None  # not informative
        else:
            reasons["studio_mismatch"] = True
            return 0.0, reasons

    # ---- Collect the signals that are actually present ---------------------
    candidates = (
        ("fp", fp),
        ("title", title),
        ("performers", performers),
        ("date", date_score),
        ("duration", duration_score),
    )
    available = {name: value for name, value in candidates if value is not None}
    if not available:
        return 0.0, {"no_signals": True}

    # ---- Pick the weight table ---------------------------------------------
    if aggregator_mode:
        # Tube sites have no date/fp; performer set + duration are the
        # strongest signals there.
        weight_table = {
            "fp": 0.20,
            "title": 0.15,
            "performers": 0.35,
            "date": 0.05,
            "duration": 0.25,
        }
        reasons["aggregator_weights"] = weight_table
    else:
        weight_table = dict(_BASE_WEIGHTS)

    # ---- Renormalise over the available signals and blend ------------------
    raw_weights = {name: weight_table[name] for name in available}
    weight_sum = sum(raw_weights.values())
    if weight_sum == 0.0:
        return 0.0, reasons
    norm_w = {name: w / weight_sum for name, w in raw_weights.items()}

    score = sum(value * norm_w[name] for name, value in available.items())
    reasons["weights"] = norm_w
    reasons["sub_scores"] = available

    if aggregator_mode:
        # In aggregator mode we require a minimum performer overlap for
        # auto-merge; without it we would rely on fuzzy title matching alone,
        # which has a high false-positive rate (different scenes can have
        # similar names).
        if performers is None or performers < 0.5:
            # Original note: cap below the review threshold (the thresholds
            # themselves live in config and are not visible here).
            score = min(score, 0.74)
            reasons["aggregator_low_performer_cap"] = True

        # Weak-signal cap: in aggregator mode, when we have NEITHER duration
        # NOR fingerprint NOR date (all of the strongest signals missing), we
        # rely EXCLUSIVELY on title + performers. Same-performer for prolific
        # actresses (Tania Amazon, Mia Malkova, Aria Alexander) gives 1.0, and
        # a token-set ratio of 0.75 from the first/last name in the title is
        # common -> the composite quickly hits the threshold (0.925) and
        # auto-merged 78 different scenes under one canonical (reported
        # 2026-05-08).
        #
        # Reload 2026-05-09: the title bypass at >=0.95 left a loophole for
        # "Simone Peach BBW rides..." vs "Peach Lollypop sexy BBW rides..."
        # (the token-set sweep pushes title above 0.95 even though these are
        # different people). Cap always, regardless of title: auto-merge
        # requires **at least one strong signal** (fp, duration, date).
        # Without one -> review queue, never auto-merge.
        has_strong_signal = any(
            signal is not None and signal >= 0.5
            for signal in (fp, duration_score, date_score)
        )
        if not has_strong_signal:
            score = min(score, 0.85)
            reasons["aggregator_weak_signal_cap"] = True

        # Strong-signal boost: in aggregator mode, duration within ±3 s +
        # performer overlap >= 0.5 + title >= 0.40 is close to a certain match
        # (same length + same performer + not-totally-different titles is a
        # very rare false positive). Bump to auto-merge when the tube SEO
        # title differs from the studio canonical title but keeps a shared
        # token.
        #
        # **Tightened 2026-05-12** (bug report ef090842): previously duration
        # ±6 s with no title guard -> "Five Star Anal Fuck" (2105 s) and
        # "Match My Freak" (2110 s) by Lily Lou were auto-merged into one
        # scene because duration diff=5 (score 0.917 >= 0.90) + performer 1.0
        # was enough. Changes:
        #   - duration >= 0.95 (<=3 s diff instead of <=6 s): Brazzers/Naughty
        #     America often have neighbouring scenes with the same actor and a
        #     derived length (intro/outro), but <=3 s is practically the same
        #     encoding
        #   - title >= 0.40: stops "totally different title" false matches
        #     while still tolerating "TheCanonicalTitle" vs
        #     "SiteSlug SEO Title TheCanonical Title FREE"
        if (
            duration_score is not None
            and duration_score >= 0.95
            and performers is not None
            and performers >= 0.5
            and title is not None
            and title >= 0.40
        ):
            if score < 0.92:
                reasons["duration_perf_strong_match_bump"] = True
                score = 0.92

    return _clamp(score), reasons
|
|
|
|
|
|
def triage(score: float) -> str:
    """Map a composite score to ``'auto'``, ``'review'`` or ``'reject'``.

    The thresholds (``auto_merge_threshold`` and ``review_threshold``) are
    read from the settings object; the auto-merge threshold is checked first.
    """
    settings = get_settings()
    verdict = "reject"
    if score >= settings.auto_merge_threshold:
        verdict = "auto"
    elif score >= settings.review_threshold:
        verdict = "review"
    return verdict
|
|
|
|
|
|
def _clamp(v: float) -> float:
    """Clamp ``v`` into [0.0, 1.0]; NaN is mapped to 0.0."""
    if math.isnan(v):
        return 0.0
    if v < 1.0:
        # Anything at or below zero (including -0.0) collapses to 0.0.
        return v if v > 0.0 else 0.0
    return 1.0
|