goon/tests/test_scoring.py
https://github.com/goon-foss/goon 642f1ab8b8 Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector
Mobile / OTA:
- Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org
- Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version)
- backend.ts: default public backend auto-connect (no manual login)

WebView fallback fix (PlayerScreen INJECTED_JS):
- Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init)
- Context-scoped: only clicks consent buttons inside cookie/gdpr containers
- Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init)

Resolver:
- Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited)
  → composite_score hard-reject / cap; wired into scene_score + bulk_dedup
- aggregator-mode candidate query: LIMIT 500 + title-match ordering

Connectors:
- porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot

landing: APK links → goon-v0.1.9.apk

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 11:20:57 +02:00

265 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Unit tests for the scoring engine — pure functions, no DB."""
from __future__ import annotations
import uuid
from datetime import date
import pytest
from app.resolve.scoring import (
composite_score,
date_proximity,
detect_modifier_tags,
detect_series_positions,
hamming_distance_hex,
performer_set_similarity,
phash_similarity,
series_mismatch_strength,
title_similarity,
triage,
)
# ---- hamming / phash ------------------------------------------------------
def test_hamming_zero_on_identical() -> None:
    """Identical hex digests must have a Hamming distance of zero."""
    distance = hamming_distance_hex("deadbeef", "deadbeef")
    assert distance == 0
def test_hamming_one_bit_difference() -> None:
    """Check the bit count for a single-bit and a full-byte difference."""
    cases = (
        ("00", "01", 1),  # 0x0 vs 0x1 differ in one bit
        ("ff", "00", 8),  # 0xFF vs 0x00 differ in all eight bits
    )
    for left, right, expected_bits in cases:
        assert hamming_distance_hex(left, right) == expected_bits
def test_hamming_raises_on_length_mismatch() -> None:
    """Hex strings of different lengths must raise ValueError."""
    shorter, longer = "abcd", "abcdef"
    with pytest.raises(ValueError):
        hamming_distance_hex(shorter, longer)
def test_phash_similarity_64bit() -> None:
    """A one-bit difference in a 64-bit hash gives similarity 1 - 1/64."""
    # 16 hex characters encode 64 bits.
    all_zero = "0000000000000000"
    one_bit_set = "0000000000000001"  # differs from all_zero by exactly one bit
    expected = 1.0 - 1 / 64
    assert phash_similarity(all_zero, one_bit_set) == pytest.approx(expected, abs=1e-6)
def test_phash_similarity_far() -> None:
    """Hashes that differ in all 64 bits must score exactly 0.0."""
    all_zero = "0000000000000000"
    all_one = "ffffffffffffffff"  # every one of the 64 bits differs
    similarity = phash_similarity(all_zero, all_one)
    assert similarity == 0.0
# ---- title similarity -----------------------------------------------------
def test_title_similarity_identical() -> None:
    """Two equal titles must score exactly 1.0."""
    score = title_similarity("the great heist", "the great heist")
    assert score == 1.0
def test_title_similarity_token_set_handles_extra_words() -> None:
    """Extra surrounding words must not push the score to 0.8 or below."""
    # Original note: token_set_ratio is robust to reordering and extra tokens.
    core_title = "great heist"
    padded_title = "the great heist extended cut"
    assert title_similarity(core_title, padded_title) > 0.8
def test_title_similarity_empty_inputs() -> None:
    """An empty title on either side must score 0.0."""
    for left, right in (("", "anything"), ("anything", "")):
        assert title_similarity(left, right) == 0.0
# ---- performer set Jaccard ------------------------------------------------
def test_performer_set_full_overlap() -> None:
    """Two lists holding the same performer ids must score 1.0."""
    first, second = uuid.uuid4(), uuid.uuid4()
    left = [first, second]
    right = [first, second]
    assert performer_set_similarity(left, right) == 1.0
def test_performer_set_partial_overlap() -> None:
    """One shared performer out of three distinct ones must score 1/3."""
    shared, only_left, only_right = uuid.uuid4(), uuid.uuid4(), uuid.uuid4()
    # Intersection size 1, union size 3 -> 1/3.
    similarity = performer_set_similarity([shared, only_left], [shared, only_right])
    assert similarity == pytest.approx(1 / 3)
def test_performer_set_empty_inputs() -> None:
    """Two empty performer lists must score 0.0."""
    similarity = performer_set_similarity([], [])
    assert similarity == 0.0
# ---- date proximity -------------------------------------------------------
def test_date_proximity_same_day() -> None:
    """A date compared with itself must score exactly 1.0."""
    release_day = date(2024, 1, 1)
    proximity = date_proximity(release_day, release_day)
    assert proximity == 1.0
def test_date_proximity_within_window() -> None:
    """Dates three days apart in a seven-day window must score 1 - 3/7."""
    start = date(2024, 1, 1)
    three_days_later = date(2024, 1, 4)
    # +3 days with window 7 -> 1 - 3/7
    proximity = date_proximity(start, three_days_later, window_days=7)
    assert proximity == pytest.approx(1 - 3 / 7)
def test_date_proximity_outside_window() -> None:
    """Dates a month apart with a seven-day window must score 0.0."""
    january_first = date(2024, 1, 1)
    february_first = date(2024, 2, 1)
    proximity = date_proximity(january_first, february_first, window_days=7)
    assert proximity == 0.0
def test_date_proximity_none_inputs() -> None:
    """A missing date on either side must score 0.0."""
    for left, right in ((None, date(2024, 1, 1)), (date(2024, 1, 1), None)):
        assert date_proximity(left, right) == 0.0
# ---- composite -----------------------------------------------------------
def test_composite_studio_mismatch_hard_rejects() -> None:
    """Without a fingerprint, a studio mismatch must zero the score and be reported."""
    signals = {
        "fp": None,
        "title": 0.9,
        "performers": 0.9,
        "date_score": 1.0,
        "studio_match": False,
    }
    score, reasons = composite_score(**signals)
    assert score == 0.0
    assert reasons.get("studio_mismatch")
def test_composite_strong_fp_overrides_studio_mismatch() -> None:
    """A 0.97 fingerprint must keep the score positive despite a studio mismatch."""
    # Original note: a strong pHash beats a studio mismatch, because the
    # sub-studio mapping sometimes drifts apart.
    signals = {
        "fp": 0.97,
        "title": 0.5,
        "performers": None,
        "date_score": 1.0,
        "studio_match": False,
    }
    score, reasons = composite_score(**signals)
    assert score > 0.0
    assert reasons.get("studio_mismatch_overridden_by_fp") is True
def test_composite_redistributes_weights_when_fp_missing() -> None:
    """All-1.0 signals must give a composite of 1.0 with or without a fingerprint."""
    # Everything is 1.0, so the composite has to be 1.0 whether fp is present or not.
    with_fp, without_fp = (
        composite_score(
            fp=fp_value, title=1.0, performers=1.0, date_score=1.0, studio_match=True
        )[0]
        for fp_value in (1.0, None)
    )
    assert with_fp == pytest.approx(1.0)
    assert without_fp == pytest.approx(1.0)
def test_composite_no_signals() -> None:
    """With every signal set to None the score must be 0.0 and flagged as such."""
    absent = dict.fromkeys(
        ("fp", "title", "performers", "date_score", "studio_match")
    )
    score, reasons = composite_score(**absent)
    assert score == 0.0
    assert reasons.get("no_signals")
def test_composite_clamps_to_unit() -> None:
    """Input signals of 2.0 must still produce a score of exactly 1.0."""
    result = composite_score(
        fp=2.0, title=2.0, performers=2.0, date_score=2.0, studio_match=True
    )
    score = result[0]
    assert score == 1.0
# ---- series position / modifier detector ---------------------------------
def test_detect_series_positions_episode() -> None:
    """An 'episode N' suffix must be detected as position N."""
    positions = detect_series_positions("pleasureville a dp xxx parody episode 4")
    assert positions == {4}
def test_detect_series_positions_part_with_dot() -> None:
    """The abbreviated 'pt. N' form must be detected as position N."""
    positions = detect_series_positions("neon moonlight pt. 2")
    assert positions == {2}
def test_detect_series_positions_hash_only() -> None:
    """A '#N' marker and a 'scene N' marker must both be collected."""
    positions = detect_series_positions("women seeking women #131 scene 2")
    assert positions == {131, 2}
def test_detect_series_positions_volume() -> None:
    """A 'volume N' marker and a 'scene N' marker must both be collected."""
    positions = detect_series_positions("women seeking women volume 140 scene 3")
    assert positions == {140, 3}
def test_detect_series_positions_s_e_style() -> None:
    """The compact 'sN eM' form must yield both numbers."""
    positions = detect_series_positions("can you handle a woman like me s9 e8")
    assert positions == {9, 8}
def test_detect_series_positions_empty() -> None:
    """None and the empty string must both yield an empty set."""
    for blank_title in (None, ""):
        assert detect_series_positions(blank_title) == set()
def test_detect_modifier_tags_bts() -> None:
    """A parenthesised 'bts' marker must be reported as the 'bts' tag."""
    tags = detect_modifier_tags("training ravyn (bts - 1)")
    assert "bts" in tags
def test_detect_modifier_tags_behind_the_scenes() -> None:
    """The spelled-out phrase must be reported as the 'behind the scenes' tag."""
    title = "behind the scenes - two pairs of suckable melons"
    tags = detect_modifier_tags(title)
    assert "behind the scenes" in tags
def test_detect_modifier_tags_unedited() -> None:
    """A parenthesised 'unedited' marker must be reported as the 'unedited' tag."""
    tags = detect_modifier_tags("bad bella stinky feet prep (unedited)")
    assert "unedited" in tags
def test_series_mismatch_episode_2_vs_4_hard() -> None:
    """Titles that differ only in episode number must give strength 1.0."""
    # Episode 2 vs 4 -> hard mismatch (1.0)
    episode_two = "pleasureville a dp xxx parody episode 2"
    episode_four = "pleasureville a dp xxx parody episode 4"
    strength = series_mismatch_strength(episode_two, episode_four)
    assert strength == 1.0
def test_series_mismatch_intersection_is_no_mismatch() -> None:
    """A shared position with BTS on one side only must give strength 0.7."""
    # Both titles carry {7} (Make'em Sweat #7), so there is NO position
    # mismatch; the BTS asymmetry alone yields 0.7.
    plain_title = "make'em sweat #7"
    bts_title = "make'em sweat #7 bts"
    strength = series_mismatch_strength(plain_title, bts_title)
    assert strength == pytest.approx(0.7)
def test_series_mismatch_partial_overlap_is_still_hard() -> None:
    """Same volume but different scene numbers must still give strength 1.0."""
    # "Volume 140 Scene 3" vs "Volume 140 Scene 4": 140 is shared but 3/4
    # differ. These are separate scenes from one compilation -> hard split.
    scene_three = "women seeking women volume 140 scene 3"
    scene_four = "women seeking women volume 140 scene 4"
    strength = series_mismatch_strength(scene_three, scene_four)
    assert strength == 1.0
def test_series_mismatch_no_year_false_positive() -> None:
    """A four-digit year in the title must not be read as a series position."""
    # "scene from 2020" must not produce a false position from the year.
    # Original note: the detector matches \d{1,3} with a boundary, so a
    # four-digit number should not be captured (implementation not shown here).
    positions = detect_series_positions("scene from 2020")
    assert positions == set()
def test_series_mismatch_bts_asymmetric() -> None:
    """BTS on one side, with no position on the other, must give strength 0.7."""
    # Titles: Training Ravyn vs Training Ravyn (BTS - 1).
    # Positions are {} vs {1}: nothing in common, but one side is empty, so
    # this is not a hard split. BTS on one side only -> 0.7.
    plain_title = "training ravyn"
    bts_title = "training ravyn (bts - 1)"
    strength = series_mismatch_strength(plain_title, bts_title)
    assert strength == pytest.approx(0.7)
def test_series_mismatch_no_signal() -> None:
    """Identical titles with no position or modifier must give strength 0.0."""
    strength = series_mismatch_strength("the great heist", "the great heist")
    assert strength == 0.0
def test_composite_series_position_hard_reject() -> None:
    """A series mismatch of 1.0 must zero the score even when all signals are 1.0."""
    # Despite every strong signal (fp/title/performers/date at 1.0), a series
    # mismatch of 1.0 forces a hard reject. This guarantees that "Episode 2 vs
    # Episode 4" with the same phash (studio reusing cover art) do NOT auto-merge.
    signals = {
        "fp": 1.0,
        "title": 1.0,
        "performers": 1.0,
        "date_score": 1.0,
        "studio_match": True,
        "series_mismatch": 1.0,
    }
    score, reasons = composite_score(**signals)
    assert score == 0.0
    assert reasons.get("series_position_mismatch")
def test_composite_series_modifier_cap_07() -> None:
    """A series mismatch of 0.7 must cap the score at 0.3 and report that cap."""
    # Modifier mismatch (BTS on one side) -> cap = 1 - 0.7 = 0.3
    signals = {
        "fp": 1.0,
        "title": 1.0,
        "performers": 1.0,
        "date_score": 1.0,
        "studio_match": True,
        "series_mismatch": 0.7,
    }
    score, reasons = composite_score(**signals)
    assert score == pytest.approx(0.3)
    assert reasons.get("series_modifier_cap") == pytest.approx(0.3)
def test_composite_series_zero_no_effect() -> None:
    """series_mismatch of 0.0 and of None must both leave the score at 1.0."""
    perfect_signals = {
        "fp": 1.0,
        "title": 1.0,
        "performers": 1.0,
        "date_score": 1.0,
        "studio_match": True,
    }
    score_zero, _ = composite_score(**perfect_signals, series_mismatch=0.0)
    score_none, _ = composite_score(**perfect_signals, series_mismatch=None)
    assert score_zero == score_none == pytest.approx(1.0)
# ---- triage --------------------------------------------------------------
def test_triage_thresholds() -> None:
    """Check the triage bucket returned for five representative scores."""
    expected_buckets = (
        (0.95, "auto"),  # original note: auto means >= 0.92
        (0.92, "auto"),
        (0.85, "review"),
        (0.75, "review"),
        (0.5, "reject"),
    )
    for score, bucket in expected_buckets:
        assert triage(score) == bucket