goon/tests/test_scoring.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

156 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Unit tests for the scoring engine: pure functions, no DB."""
from __future__ import annotations
import uuid
from datetime import date
import pytest
from app.resolve.scoring import (
composite_score,
date_proximity,
hamming_distance_hex,
performer_set_similarity,
phash_similarity,
title_similarity,
triage,
)
# ---- hamming / phash ------------------------------------------------------
def test_hamming_zero_on_identical() -> None:
    """Comparing a hex string with itself is expected to give distance 0."""
    digest = "deadbeef"
    assert hamming_distance_hex(digest, digest) == 0
def test_hamming_one_bit_difference() -> None:
    """The distance is expected to be counted in differing bits."""
    cases = (
        ("00", "01", 1),  # 0x00 vs 0x01 differ in a single bit
        ("ff", "00", 8),  # 0xFF vs 0x00 differ in all 8 bits
    )
    for left, right, expected_bits in cases:
        assert hamming_distance_hex(left, right) == expected_bits
def test_hamming_raises_on_length_mismatch() -> None:
    """Hex strings of different lengths are expected to raise ValueError."""
    shorter, longer = "abcd", "abcdef"
    with pytest.raises(ValueError):
        hamming_distance_hex(shorter, longer)
def test_phash_similarity_64bit() -> None:
    """One differing bit out of 64 is expected to cost 1/64 of similarity."""
    # 16 hex characters encode 64 bits.
    reference = "0000000000000000"
    one_bit_off = "0000000000000001"  # differs from the reference in 1 bit
    expected = 1.0 - 1 / 64
    assert phash_similarity(reference, one_bit_off) == pytest.approx(
        expected, abs=1e-6
    )
def test_phash_similarity_far() -> None:
    """Hashes that differ in every bit are expected to score exactly 0.0."""
    all_zero_bits = "0000000000000000"
    all_one_bits = "ffffffffffffffff"  # all 64 bits differ
    similarity = phash_similarity(all_zero_bits, all_one_bits)
    assert similarity == 0.0
# ---- title similarity -----------------------------------------------------
def test_title_similarity_identical() -> None:
    """Two identical titles are expected to score exactly 1.0."""
    title = "the great heist"
    assert title_similarity(title, title) == 1.0
def test_title_similarity_token_set_handles_extra_words() -> None:
    """Extra surrounding words should still leave the titles a close match."""
    # Per the original note: token_set_ratio is robust to reordering and
    # to additional tokens.
    short_title = "great heist"
    long_title = "the great heist extended cut"
    assert title_similarity(short_title, long_title) > 0.8
def test_title_similarity_empty_inputs() -> None:
    """An empty title on either side is expected to score 0.0."""
    argument_pairs = (("", "anything"), ("anything", ""))
    for left, right in argument_pairs:
        assert title_similarity(left, right) == 0.0
# ---- performer set Jaccard ------------------------------------------------
def test_performer_set_full_overlap() -> None:
    """Two lists holding the same performer ids are expected to score 1.0."""
    first_id, second_id = uuid.uuid4(), uuid.uuid4()
    left = [first_id, second_id]
    right = [first_id, second_id]
    assert performer_set_similarity(left, right) == 1.0
def test_performer_set_partial_overlap() -> None:
    """One shared performer out of three distinct ones is expected to give 1/3."""
    shared, only_left, only_right = (uuid.uuid4() for _ in range(3))
    # Intersection has 1 element, union has 3, so the expected ratio is 1/3.
    similarity = performer_set_similarity([shared, only_left], [shared, only_right])
    assert similarity == pytest.approx(1 / 3)
def test_performer_set_empty_inputs() -> None:
    """Two empty performer lists are expected to score 0.0."""
    similarity = performer_set_similarity([], [])
    assert similarity == 0.0
# ---- date proximity -------------------------------------------------------
def test_date_proximity_same_day() -> None:
    """The same date on both sides is expected to score exactly 1.0."""
    release_day = date(2024, 1, 1)
    proximity = date_proximity(release_day, release_day)
    assert proximity == 1.0
def test_date_proximity_within_window() -> None:
    """A 3-day gap with a 7-day window is expected to score 1 - 3/7."""
    start = date(2024, 1, 1)
    three_days_later = date(2024, 1, 4)
    expected = 1 - 3 / 7
    proximity = date_proximity(start, three_days_later, window_days=7)
    assert proximity == pytest.approx(expected)
def test_date_proximity_outside_window() -> None:
    """Dates a month apart with a 7-day window are expected to score 0.0."""
    earlier = date(2024, 1, 1)
    later = date(2024, 2, 1)
    assert date_proximity(earlier, later, window_days=7) == 0.0
def test_date_proximity_none_inputs() -> None:
    """A missing date on either side is expected to score 0.0."""
    known_day = date(2024, 1, 1)
    for left, right in ((None, known_day), (known_day, None)):
        assert date_proximity(left, right) == 0.0
# ---- composite -----------------------------------------------------------
def test_composite_studio_mismatch_hard_rejects() -> None:
    """With no fingerprint, a studio mismatch is expected to zero the score."""
    signals = {
        "fp": None,
        "title": 0.9,
        "performers": 0.9,
        "date_score": 1.0,
        "studio_match": False,
    }
    score, reasons = composite_score(**signals)
    assert score == 0.0
    # The rejection cause must be reported under a truthy "studio_mismatch" entry.
    assert reasons.get("studio_mismatch")
def test_composite_strong_fp_overrides_studio_mismatch() -> None:
    """A strong fingerprint match is expected to survive a studio mismatch."""
    # A strong pHash beats a studio mismatch (sub-studio mappings sometimes
    # diverge between sources).
    signals = {
        "fp": 0.97,
        "title": 0.5,
        "performers": None,
        "date_score": 1.0,
        "studio_match": False,
    }
    score, reasons = composite_score(**signals)
    assert score > 0.0
    assert reasons.get("studio_mismatch_overridden_by_fp") is True
def test_composite_redistributes_weights_when_fp_missing() -> None:
    """Perfect signals are expected to give 1.0 with or without a fingerprint."""
    # Everything at 1.0 means the composite must be 1.0 regardless of whether
    # the fingerprint signal is available.
    perfect_signals = {
        "title": 1.0,
        "performers": 1.0,
        "date_score": 1.0,
        "studio_match": True,
    }
    with_fp, _ = composite_score(fp=1.0, **perfect_signals)
    without_fp, _ = composite_score(fp=None, **perfect_signals)
    assert with_fp == pytest.approx(1.0)
    assert without_fp == pytest.approx(1.0)
def test_composite_no_signals() -> None:
    """With every signal missing, the score is expected to be 0.0 and flagged."""
    signal_names = ("fp", "title", "performers", "date_score", "studio_match")
    nothing_known = dict.fromkeys(signal_names)  # every signal set to None
    score, reasons = composite_score(**nothing_known)
    assert score == 0.0
    assert reasons.get("no_signals")
def test_composite_clamps_to_unit() -> None:
    """Out-of-range inputs (2.0 each) are expected to yield a score of 1.0."""
    oversized = 2.0
    score, _ = composite_score(
        fp=oversized,
        title=oversized,
        performers=oversized,
        date_score=oversized,
        studio_match=True,
    )
    assert score == 1.0
# ---- triage --------------------------------------------------------------
def test_triage_thresholds() -> None:
    """Each sample score is expected to land in the listed triage bucket."""
    expectations = (
        (0.95, "auto"),  # original note: auto means >= 0.92
        (0.92, "auto"),
        (0.85, "review"),
        (0.75, "review"),
        (0.5, "reject"),
    )
    for score, bucket in expectations:
        assert triage(score) == bucket