"""Unit testy scoring engine — czyste funkcje bez DB.""" from __future__ import annotations import uuid from datetime import date import pytest from app.resolve.scoring import ( composite_score, date_proximity, detect_modifier_tags, detect_series_positions, hamming_distance_hex, performer_set_similarity, phash_similarity, series_mismatch_strength, title_similarity, triage, ) # ---- hamming / phash ------------------------------------------------------ def test_hamming_zero_on_identical() -> None: assert hamming_distance_hex("deadbeef", "deadbeef") == 0 def test_hamming_one_bit_difference() -> None: # 0x0 vs 0x1 = 1 bit assert hamming_distance_hex("00", "01") == 1 # 0xFF vs 0x00 = 8 bity assert hamming_distance_hex("ff", "00") == 8 def test_hamming_raises_on_length_mismatch() -> None: with pytest.raises(ValueError): hamming_distance_hex("abcd", "abcdef") def test_phash_similarity_64bit() -> None: # 16 hex chars = 64 bits a = "0000000000000000" b = "0000000000000001" # 1 bit różnicy sim = phash_similarity(a, b) assert sim == pytest.approx(1.0 - 1 / 64, abs=1e-6) def test_phash_similarity_far() -> None: a = "0000000000000000" b = "ffffffffffffffff" # 64 bity różnicy assert phash_similarity(a, b) == 0.0 # ---- title similarity ----------------------------------------------------- def test_title_similarity_identical() -> None: assert title_similarity("the great heist", "the great heist") == 1.0 def test_title_similarity_token_set_handles_extra_words() -> None: # token_set_ratio jest odporny na zmianę kolejności i dodatkowe tokeny s = title_similarity("great heist", "the great heist extended cut") assert s > 0.8 def test_title_similarity_empty_inputs() -> None: assert title_similarity("", "anything") == 0.0 assert title_similarity("anything", "") == 0.0 # ---- performer set Jaccard ------------------------------------------------ def test_performer_set_full_overlap() -> None: a = uuid.uuid4() b = uuid.uuid4() assert performer_set_similarity([a, b], [a, b]) == 1.0 def test_performer_set_partial_overlap() -> None: a, b, c = uuid.uuid4(), uuid.uuid4(), uuid.uuid4() # |∩| = 1, |∪| = 3 → 1/3 assert performer_set_similarity([a, b], [a, c]) == pytest.approx(1 / 3) def test_performer_set_empty_inputs() -> None: assert performer_set_similarity([], []) == 0.0 # ---- date proximity ------------------------------------------------------- def test_date_proximity_same_day() -> None: d = date(2024, 1, 1) assert date_proximity(d, d) == 1.0 def test_date_proximity_within_window() -> None: d = date(2024, 1, 1) # +3 days, window 7 → 1 - 3/7 assert date_proximity(d, date(2024, 1, 4), window_days=7) == pytest.approx(1 - 3 / 7) def test_date_proximity_outside_window() -> None: assert date_proximity(date(2024, 1, 1), date(2024, 2, 1), window_days=7) == 0.0 def test_date_proximity_none_inputs() -> None: assert date_proximity(None, date(2024, 1, 1)) == 0.0 assert date_proximity(date(2024, 1, 1), None) == 0.0 # ---- composite ----------------------------------------------------------- def test_composite_studio_mismatch_hard_rejects() -> None: score, reasons = composite_score( fp=None, title=0.9, performers=0.9, date_score=1.0, studio_match=False ) assert score == 0.0 assert reasons.get("studio_mismatch") def test_composite_strong_fp_overrides_studio_mismatch() -> None: # silny pHash bije studio mismatch (sub-studio mapping rozjeżdża się czasem) score, reasons = composite_score( fp=0.97, title=0.5, performers=None, date_score=1.0, studio_match=False ) assert score > 0.0 assert reasons.get("studio_mismatch_overridden_by_fp") is True def test_composite_redistributes_weights_when_fp_missing() -> None: # wszystko 1.0 → composite musi być 1.0 niezależnie czy mamy fp czy nie s_with, _ = composite_score(fp=1.0, title=1.0, performers=1.0, date_score=1.0, studio_match=True) s_without, _ = composite_score(fp=None, title=1.0, performers=1.0, date_score=1.0, studio_match=True) assert s_with == pytest.approx(1.0) assert s_without == pytest.approx(1.0) def test_composite_no_signals() -> None: score, reasons = composite_score( fp=None, title=None, performers=None, date_score=None, studio_match=None ) assert score == 0.0 assert reasons.get("no_signals") def test_composite_clamps_to_unit() -> None: score, _ = composite_score(fp=2.0, title=2.0, performers=2.0, date_score=2.0, studio_match=True) assert score == 1.0 # ---- triage -------------------------------------------------------------- # ---- series position / modifier detector --------------------------------- def test_detect_series_positions_episode() -> None: assert detect_series_positions("pleasureville a dp xxx parody episode 4") == {4} def test_detect_series_positions_part_with_dot() -> None: assert detect_series_positions("neon moonlight pt. 2") == {2} def test_detect_series_positions_hash_only() -> None: assert detect_series_positions("women seeking women #131 scene 2") == {131, 2} def test_detect_series_positions_volume() -> None: assert detect_series_positions("women seeking women volume 140 scene 3") == {140, 3} def test_detect_series_positions_s_e_style() -> None: assert detect_series_positions("can you handle a woman like me s9 e8") == {9, 8} def test_detect_series_positions_empty() -> None: assert detect_series_positions(None) == set() assert detect_series_positions("") == set() def test_detect_modifier_tags_bts() -> None: assert "bts" in detect_modifier_tags("training ravyn (bts - 1)") def test_detect_modifier_tags_behind_the_scenes() -> None: assert "behind the scenes" in detect_modifier_tags( "behind the scenes - two pairs of suckable melons" ) def test_detect_modifier_tags_unedited() -> None: assert "unedited" in detect_modifier_tags("bad bella stinky feet prep (unedited)") def test_series_mismatch_episode_2_vs_4_hard() -> None: # Episode 2 vs 4 → twardy mismatch (1.0) s = series_mismatch_strength( "pleasureville a dp xxx parody episode 2", "pleasureville a dp xxx parody episode 4", ) assert s == 1.0 def test_series_mismatch_intersection_is_no_mismatch() -> None: # Oba mają {7} (Make'em Sweat #7) → BRAK mismatchu na pozycji, # ale BTS asymmetry → 0.7 s = series_mismatch_strength("make'em sweat #7", "make'em sweat #7 bts") assert s == pytest.approx(0.7) def test_series_mismatch_partial_overlap_is_still_hard() -> None: # "Volume 140 Scene 3" vs "Volume 140 Scene 4" — wspólny 140 ale różne 3/4, # to są osobne sceny ze wspólnej kompilacji → hard split. s = series_mismatch_strength( "women seeking women volume 140 scene 3", "women seeking women volume 140 scene 4", ) assert s == 1.0 def test_series_mismatch_no_year_false_positive() -> None: # "scene from 2020" nie może wygenerować fałszywej pozycji z roku. pos = detect_series_positions("scene from 2020") # Może tu być {2020}? Nie — \d{1,3} z anti-greedy boundary nie złapie 4-cyfr. assert pos == set() def test_series_mismatch_bts_asymmetric() -> None: # Tytuły: Training Ravyn vs Training Ravyn (BTS - 1) # pos: {} vs {1} → brak common pos ale jedna strona pusta → nie hard split # BTS po jednej stronie → 0.7 s = series_mismatch_strength("training ravyn", "training ravyn (bts - 1)") assert s == pytest.approx(0.7) def test_series_mismatch_no_signal() -> None: s = series_mismatch_strength("the great heist", "the great heist") assert s == 0.0 def test_composite_series_position_hard_reject() -> None: # Mimo wszystkich silnych sygnałów (fp/title/performers/date 1.0) — series mismatch # 1.0 forsuje twardy reject. To gwarantuje że "Episode 2 vs Episode 4" z tym samym # phashem (studio reuse cover art) NIE auto-mergeują. score, reasons = composite_score( fp=1.0, title=1.0, performers=1.0, date_score=1.0, studio_match=True, series_mismatch=1.0, ) assert score == 0.0 assert reasons.get("series_position_mismatch") def test_composite_series_modifier_cap_07() -> None: # Modifier mismatch (BTS po jednej stronie) → cap = 1 - 0.7 = 0.3 score, reasons = composite_score( fp=1.0, title=1.0, performers=1.0, date_score=1.0, studio_match=True, series_mismatch=0.7, ) assert score == pytest.approx(0.3) assert reasons.get("series_modifier_cap") == pytest.approx(0.3) def test_composite_series_zero_no_effect() -> None: score_a, _ = composite_score( fp=1.0, title=1.0, performers=1.0, date_score=1.0, studio_match=True, series_mismatch=0.0, ) score_b, _ = composite_score( fp=1.0, title=1.0, performers=1.0, date_score=1.0, studio_match=True, series_mismatch=None, ) assert score_a == score_b == pytest.approx(1.0) # ---- triage -------------------------------------------------------------- def test_triage_thresholds() -> None: assert triage(0.95) == "auto" # >= 0.92 assert triage(0.92) == "auto" assert triage(0.85) == "review" assert triage(0.75) == "review" assert triage(0.5) == "reject"