Mobile / OTA: - Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org - Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version) - backend.ts: default public backend auto-connect (no manual login) WebView fallback fix (PlayerScreen INJECTED_JS): - Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init) - Context-scoped: only clicks consent buttons inside cookie/gdpr containers - Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init) Resolver: - Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited) → composite_score hard-reject / cap; wired into scene_score + bulk_dedup - aggregator-mode candidate query: LIMIT 500 + title-match ordering Connectors: - porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot landing: APK links → goon-v0.1.9.apk Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
379 lines
14 KiB
Python
379 lines
14 KiB
Python
"""Scoring funkcji do dopasowania kandydatów scen.
|
|
|
|
Wszystkie sub-score'y wracają w [0, 1]. Composite score łączy je z wagami,
|
|
redystrybuując wagę gdy któryś sygnał jest niedostępny (np. brak fingerprintu).
|
|
|
|
Wagi (gdy wszystko dostępne):
|
|
    fp_phash:   0.40
    title:      0.20
    performers: 0.15
    date:       0.15
    duration:   0.10
|
|
|
|
Twardy reject: studio_match=False → score 0.0 (chyba że ma silny fingerprint match
|
|
≥0.95, wtedy ufamy fingerprintowi i ignorujemy studio mismatch — bo zdarza się że
|
|
TPDB ma "Brazzers Exxtra" a StashDB "Brazzers" jako studio sceny).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import re
|
|
import uuid
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
from datetime import date
|
|
|
|
from rapidfuzz import fuzz
|
|
|
|
from app.config import get_settings
|
|
|
|
|
|
@dataclass
class ScoreBreakdown:
    """Per-sub-score values, the final composite and the reasons report.

    Every sub-score defaults to ``None`` and ``composite`` defaults to 0.0.
    ``reasons`` is intended to be stored in ``merge_candidates.reasons``.
    """

    fp: float | None = None
    title: float | None = None
    performers: float | None = None
    date: float | None = None
    duration: float | None = None
    studio_match: bool | None = None
    composite: float = 0.0
    # Defaults to None, so the annotation says so; to_dict() substitutes an
    # empty dict when no reasons were recorded.
    reasons: dict | None = None

    def to_dict(self) -> dict:
        """Return all fields as a plain dict; a missing ``reasons`` becomes ``{}``."""
        return {
            "fp": self.fp,
            "title": self.title,
            "performers": self.performers,
            "date": self.date,
            "duration": self.duration,
            "studio_match": self.studio_match,
            "composite": self.composite,
            "reasons": self.reasons or {},
        }
|
|
|
|
|
|
# ---- Sub-scorers ----------------------------------------------------------
|
|
|
|
def hamming_distance_hex(a: str, b: str) -> int:
    """Return the Hamming distance (differing bits) of two equal-length hex hashes.

    Raises:
        ValueError: if the two strings differ in length (or are not valid hex,
            via ``int(..., 16)``).
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise ValueError(f"hash length mismatch: {len_a} vs {len_b}")
    # XOR leaves a 1 bit exactly where the two hashes disagree.
    differing_bits = int(a, 16) ^ int(b, 16)
    return format(differing_bits, "b").count("1")
|
|
|
|
|
|
def phash_similarity(a: str, b: str, *, bits: int = 64) -> float:
    """Return ``1 - hamming / bits``, floored at 0.0.

    For a 64-bit phash, at most 5 differing bits gives a similarity >= 0.92.
    """
    distance = hamming_distance_hex(a, b)
    similarity = 1.0 - distance / bits
    # Hashes wider than `bits` can push the raw value below zero.
    return similarity if similarity > 0.0 else 0.0
|
|
|
|
|
|
def title_similarity(a: str, b: str) -> float:
    """Return the fuzzy similarity of two titles in [0, 1].

    `a` and `b` should already be normalized (`title_normalized`). The
    token-set ratio is robust to word reordering and to extra tokens.
    An empty or missing title on either side scores 0.0.
    """
    if a and b:
        return fuzz.token_set_ratio(a, b) / 100.0
    return 0.0
|
|
|
|
|
|
def performer_set_similarity(
    left_ids: Iterable[uuid.UUID],
    right_ids: Iterable[uuid.UUID],
) -> float:
    """Return the Jaccard index of two sets of canonical performer UUIDs.

    ``None`` entries are ignored and IDs are compared by their string form.
    When both sides are empty there is no information, so the result is 0.0
    (not 1.0).
    """
    left = {str(pid) for pid in left_ids if pid is not None}
    right = {str(pid) for pid in right_ids if pid is not None}
    union = left | right
    # The union is empty exactly when both sides are empty, so one guard is
    # enough and it also protects the division below.
    if not union:
        return 0.0
    return len(left & right) / len(union)
|
|
|
|
|
|
def date_proximity(left: date | None, right: date | None, *, window_days: int = 7) -> float:
    """Score how close two dates are.

    Returns 1.0 for the same day, falls linearly to 0.0 across
    ``window_days``, and is 0.0 outside the window or when either date is
    missing.
    """
    if left is not None and right is not None:
        gap_days = abs((left - right).days)
        if gap_days == 0:
            proximity = 1.0
        elif gap_days > window_days:
            proximity = 0.0
        else:
            proximity = 1.0 - gap_days / window_days
        return proximity
    return 0.0
|
|
|
|
|
|
# Captures "Episode 4" / "Ep 4" / "Part 2" / "Pt. 3" / "Vol 7" / "Volume 12" /
# "Scene 5" / "Chapter 9" / "Ch.3" / "#7" / "S9:E8" / "S9E8" in a normalized
# title (lower-cased, dots usually stripped, but an optional "." is tolerated).
# The `(?<!\d)` / `(?!\d)` guards around each number stop the regex from
# capturing a fragment of a longer digit run, e.g. "part 2020" does not match.
_SERIES_NUM_RE = re.compile(
    "|".join(
        (
            # <keyword> [.] [#] <number>
            r"\b(?:episode|ep|part|pt|vol|volume|chapter|ch|scene|series)\b\s*\.?\s*#?\s*(?<!\d)(\d{1,3})(?!\d)",
            # bare "#<number>" not glued to a preceding word character
            r"(?<!\w)#\s*(?<!\d)(\d{1,3})(?!\d)",
            # season/episode shorthand: "s9:e8", "s9e8"
            r"\bs(?<!\d)(\d{1,2})(?!\d)\s*[:e]\s*e?(?<!\d)(\d{1,3})(?!\d)",
        )
    ),
    flags=re.IGNORECASE,
)
|
|
|
|
# Tags that explicitly mark a scene as a separate variant (BTS / bonus /
# unedited / trailer). If a tag is present on ONLY one side, the two titles do
# NOT describe the same scene.
_MODIFIER_TAGS: tuple[str, ...] = (
    "behind the scenes",
    "behind-the-scenes",
    "bts",
    "bonus",
    "unedited",
    "uncut",
    "extended",
    "directors cut",
    "director's cut",
    "trailer",
    "preview",
    "teaser",
    "compilation",
)

# One pre-compiled matcher per tag. A tag must not be glued to another
# letter/digit on either side, so "debts" does not yield "bts" and
# "stripteaser" does not yield "teaser". A simple plural suffix ("trailers",
# "bonuses") is still accepted, as it was with plain substring matching.
_MODIFIER_TAG_PATTERNS = {
    tag: re.compile(rf"(?<![a-z0-9]){re.escape(tag)}(?:e?s)?(?![a-z0-9])")
    for tag in _MODIFIER_TAGS
}


def detect_series_positions(title_normalized: str | None) -> set[int]:
    """Return every series position (Episode/Part/Vol/Scene/Chapter/# ...) found in the title.

    The title should be normalized (lowercase, unaccented), but the regex is
    case-insensitive and tolerant; this is only a signal, not robust parsing.
    A missing or empty title yields an empty set.
    """
    if not title_normalized:
        return set()
    return {
        int(captured)
        for match in _SERIES_NUM_RE.finditer(title_normalized)
        for captured in match.groups()
        # Unmatched alternatives contribute None groups; skip them.
        if captured and captured.isdigit()
    }


def detect_modifier_tags(title_normalized: str | None) -> set[str]:
    """Return the set of modifier tags (bts/bonus/unedited/...) found in the title.

    Tags are matched case-insensitively as whole words (optionally pluralized),
    not as raw substrings. A missing or empty title yields an empty set.
    """
    if not title_normalized:
        return set()
    lowered = title_normalized.lower()
    return {
        tag
        for tag, pattern in _MODIFIER_TAG_PATTERNS.items()
        if pattern.search(lowered)
    }
|
|
|
|
|
|
def series_mismatch_strength(
    title_a_normalized: str | None,
    title_b_normalized: str | None,
) -> float:
    """Detect a "scene variant" divergence between two titles.

    Returns a strength in [0.0, 1.0]:
        0.0 - no mismatch signal (titles are compatible).
        0.5 - modifier tags on both sides, but DIFFERENT ones (BTS vs trailer).
        0.7 - a modifier tag on one side only (BTS vs regular).
        1.0 - series position mismatch (Episode 2 vs Episode 4 -> hard reject).
    """
    positions_a = detect_series_positions(title_a_normalized)
    positions_b = detect_series_positions(title_b_normalized)
    # Hard mismatch when both sides carry positions and the sets differ, e.g.
    # "Vol 140 Scene 3" vs "Vol 140 Scene 4" share 140 but differ on 3/4, so
    # they are separate scenes from one compilation. A one-sided absence (one
    # title has a position, the other has none) is not a mismatch, because
    # tube SEO titles often drop the number.
    if positions_a and positions_b and positions_a != positions_b:
        return 1.0

    tags_a = detect_modifier_tags(title_a_normalized)
    tags_b = detect_modifier_tags(title_b_normalized)
    if bool(tags_a) != bool(tags_b):
        # Exactly one side is tagged as a variant.
        return 0.7
    # From here both sides are tagged or both are untagged.
    if tags_a and tags_a.isdisjoint(tags_b):
        return 0.5
    return 0.0
|
|
|
|
|
|
def duration_proximity(
    left: int | None, right: int | None, *, window_sec: int = 60
) -> float | None:
    """Score how close two durations (in seconds) are.

    Returns 1.0 for identical durations and falls linearly to 0.0 across
    ``window_sec``; outside the window it is 0.0 (different scenes).

    Returns None when either value is missing or zero (uninformative signal).
    Tubes rarely report an exact duration; a difference within +/-60s usually
    means the same scene with a different intro/outro.
    """
    if left and right:
        gap_sec = abs(left - right)
        if gap_sec == 0:
            proximity = 1.0
        elif gap_sec > window_sec:
            proximity = 0.0
        else:
            proximity = 1.0 - gap_sec / window_sec
        return proximity
    return None
|
|
|
|
|
|
# ---- Composite ------------------------------------------------------------
|
|
|
|
# Base weights used when every signal is available.
_BASE_WEIGHTS: dict[str, float] = {
    "fp": 0.40,
    "title": 0.20,
    "performers": 0.15,
    "date": 0.15,
    "duration": 0.10,
}
|
|
|
|
|
|
def composite_score(
    *,
    fp: float | None,
    title: float | None,
    performers: float | None,
    date_score: float | None,
    duration_score: float | None = None,
    studio_match: bool | None,
    aggregator_mode: bool = False,
    series_mismatch: float | None = None,
) -> tuple[float, dict]:
    """Combine the sub-scores into one composite in [0, 1] plus a reasons report.

    Sub-scores that are ``None`` are left out and the remaining weights are
    renormalized to sum to 1.

    ``studio_match=False`` is a hard reject (0.0) unless:
      - ``fp >= 0.95`` (a strong fingerprint beats a studio mismatch), or
      - ``aggregator_mode=True`` (tubes such as HQPorner aggregate from many
        studios, so the studio is not informative; the hard reject is skipped
        and an alternative weight table favouring performers/duration is used).

    ``series_mismatch`` is the value from ``series_mismatch_strength()``:
    at 1.0 (Episode 2 vs Episode 4) it forces a hard reject regardless of the
    other signals; between 0 and 1 (modifier mismatch such as BTS/bonus/
    unedited) it caps the score at ``1 - strength``.

    Returns:
        ``(score, reasons)`` where ``reasons`` records which rules fired,
        the normalized weights and the sub-scores that were used.
    """
    reasons: dict = {}

    # Hard reject first: different series positions can never be the same scene.
    if series_mismatch is not None and series_mismatch >= 1.0:
        reasons["series_position_mismatch"] = True
        return 0.0, reasons

    if studio_match is False:
        fp_overrides_studio = fp is not None and fp >= 0.95
        if fp_overrides_studio:
            reasons["studio_mismatch_overridden_by_fp"] = True
        elif aggregator_mode:
            # Studio is not informative for aggregators; carry on scoring.
            reasons["studio_ignored_aggregator"] = True
        else:
            reasons["studio_mismatch"] = True
            return 0.0, reasons

    signals = (
        ("fp", fp),
        ("title", title),
        ("performers", performers),
        ("date", date_score),
        ("duration", duration_score),
    )
    available = {name: value for name, value in signals if value is not None}
    if not available:
        return 0.0, {"no_signals": True}

    if aggregator_mode:
        # Tubes lack date/fp; performer set + duration are the strongest signals.
        base_weights = {
            "fp": 0.20,
            "title": 0.15,
            "performers": 0.35,
            "date": 0.05,
            "duration": 0.25,
        }
        reasons["aggregator_weights"] = base_weights
    else:
        base_weights = dict(_BASE_WEIGHTS)

    # Keep only the weights of present signals and renormalize them.
    weights = {name: base_weights[name] for name in available}
    total_w = sum(weights.values())
    if total_w == 0.0:
        return 0.0, reasons
    norm_w = {name: weight / total_w for name, weight in weights.items()}

    score = sum(value * norm_w[name] for name, value in available.items())
    reasons["weights"] = norm_w
    reasons["sub_scores"] = available

    if aggregator_mode:
        # Aggregator mode requires a minimum performer overlap for auto-merge;
        # without it we would rely on fuzzy title alone, which has a high
        # false-positive rate (different scenes can have similar names).
        if performers is None or performers < 0.5:
            score = min(score, 0.74)  # cap below the review threshold
            reasons["aggregator_low_performer_cap"] = True

        # Weak-signal cap: when none of duration / fingerprint / date reaches
        # 0.5 we rely SOLELY on title + performers. Same-performer for prolific
        # performers (Tania Amazon, Mia Malkova, Aria Alexander) gives 1.0 and
        # a token-set ratio of 0.75 from the name in the title is common, so
        # the composite quickly hit the threshold (0.925) and auto-merged 78
        # different scenes under one canonical (reported 2026-05-08).
        #
        # Reload 2026-05-09: a title bypass at >=0.95 left a loophole for
        # "Simone Peach BBW rides..." vs "Peach Lollypop sexy BBW rides..."
        # (the token-set sweep pushes title above 0.95 although these are
        # different people). So the cap always applies, regardless of title:
        # auto-merge needs **at least one strong signal** (fp, duration,
        # date). Without one -> review queue, never auto-merge.
        has_strong_signal = (
            (fp is not None and fp >= 0.5)
            or (duration_score is not None and duration_score >= 0.5)
            or (date_score is not None and date_score >= 0.5)
        )
        if not has_strong_signal:
            score = min(score, 0.85)
            reasons["aggregator_weak_signal_cap"] = True

        # Strong-signal boost: duration within ~3s + performer overlap >= 0.5
        # + title >= 0.40 is almost certainly a match (same length, same
        # performer and not-totally-different titles is a very rare false
        # positive). Bump towards auto-merge when the tube SEO title differs
        # from the studio canonical title but keeps a common token.
        #
        # **Tightened 2026-05-12** (bug report ef090842): previously duration
        # +/-6s with no title guard let "Five Star Anal Fuck" (2105s) and
        # "Match My Freak" (2110s), both Lily Lou, auto-merge into one scene
        # because duration diff=5 (score 0.917 >= 0.90) + performer 1.0 was
        # enough. Changes:
        #   - duration >= 0.95 (<=3s diff instead of <=6s): Brazzers / Naughty
        #     America often have adjacent scenes with the same actor and a
        #     similar length (intro/outro), but <=3s is practically the same
        #     encoding.
        #   - title >= 0.40: stops "totally different title" false matches
        #     while still tolerating "TheCanonicalTitle" vs
        #     "SiteSlug SEO Title TheCanonical Title FREE".
        strong_duration_perf_title = (
            duration_score is not None
            and duration_score >= 0.95
            and performers is not None
            and performers >= 0.5
            and title is not None
            and title >= 0.40
        )
        if strong_duration_perf_title:
            if score < 0.92:
                reasons["duration_perf_strong_match_bump"] = True
            score = max(score, 0.92)

    # Series-modifier cap: one side has "BTS"/"bonus"/"unedited" and the other
    # does not, or the tags differ. The hard mismatch (different numeric
    # positions) already returned 0.0 above; only soft signals remain here, and
    # the cap ensures they never auto-merge.
    if series_mismatch is not None and 0.0 < series_mismatch < 1.0:
        cap = max(0.0, 1.0 - series_mismatch)
        if score > cap:
            reasons["series_modifier_cap"] = cap
            reasons["series_mismatch_strength"] = series_mismatch
            score = cap

    return _clamp(score), reasons
|
|
|
|
|
|
def triage(score: float) -> str:
    """Classify a score as 'auto', 'review' or 'reject' using the config thresholds."""
    settings = get_settings()
    if score >= settings.auto_merge_threshold:
        verdict = "auto"
    elif score >= settings.review_threshold:
        verdict = "review"
    else:
        verdict = "reject"
    return verdict
|
|
|
|
|
|
def _clamp(v: float) -> float:
    """Clamp `v` into [0.0, 1.0]; NaN is treated as 0.0."""
    if math.isnan(v):
        return 0.0
    if v >= 1.0:
        return 1.0
    if v <= 0.0:
        return 0.0
    return v
|