goon/app/resolve/scene_score.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

117 lines
3.5 KiB
Python

"""Scoring kandydat ↔ kandydat dla pipeline'u dedup."""
from __future__ import annotations
import uuid
from collections.abc import Iterable
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.models.scene import Scene, SceneFingerprint, ScenePerformer
from app.normalize.scenes import NormalizedScene
from app.resolve.scoring import (
ScoreBreakdown,
composite_score,
date_proximity,
duration_proximity,
performer_set_similarity,
phash_similarity,
title_similarity,
)
def score_candidate(
    session: Session,
    *,
    candidate: Scene,
    norm: NormalizedScene,
    resolved_performer_ids: Iterable[uuid.UUID],
    studio_id: uuid.UUID | None,
    aggregator_mode: bool = False,
) -> ScoreBreakdown:
    """Compute the ScoreBreakdown for a (DB candidate, normalized imported scene) pair.

    Args:
        session: SQLAlchemy session used to load the candidate's fingerprints
            and performer links.
        candidate: Scene already stored in the database.
        norm: Normalized scene coming from the import.
        resolved_performer_ids: Performer ids resolved for the imported scene.
            Any iterable is accepted, including a one-shot iterator.
        studio_id: Studio resolved for the imported scene, or ``None``.
        aggregator_mode: ``True`` for scenes coming from a tube/aggregator
            (e.g. pornapp): the studio is not informative there (a tube
            aggregates many studios) and performers become the main signal —
            see ``composite_score`` for details.

    Returns:
        A ScoreBreakdown holding the individual signals, the composite score
        and the reasons reported by ``composite_score``.
    """
    fp = _best_phash_similarity(session, candidate.id, norm.fingerprints)
    title = title_similarity(candidate.title_normalized, norm.title_normalized)
    cand_perfs = _candidate_performer_ids(session, candidate.id)

    # Materialize exactly once: the parameter is only an Iterable, so it may be
    # a one-shot iterator. Calling list() on it both in the guard and in the
    # call would hand an already-drained (empty) list to the similarity function.
    incoming_perfs = list(resolved_performer_ids)
    if cand_perfs or incoming_perfs:
        perf = performer_set_similarity(cand_perfs, incoming_perfs)
    else:
        # No performers on either side: the signal carries no information.
        perf = None

    date_score = date_proximity(candidate.release_date, norm.release_date)
    duration_score = duration_proximity(candidate.duration_sec, norm.duration_sec)

    studio_match: bool | None
    if studio_id is None or candidate.studio_id is None:
        studio_match = None  # not informative
    else:
        studio_match = candidate.studio_id == studio_id

    # The composite only receives the date signal when both dates are present;
    # the breakdown below still stores the raw date_proximity result.
    both_dates_known = candidate.release_date and norm.release_date
    composite, reasons = composite_score(
        fp=fp,
        title=title,
        performers=perf,
        date_score=date_score if both_dates_known else None,
        duration_score=duration_score,
        studio_match=studio_match,
        aggregator_mode=aggregator_mode,
    )
    return ScoreBreakdown(
        fp=fp,
        title=title,
        performers=perf,
        date=date_score,
        duration=duration_score,
        studio_match=studio_match,
        composite=composite,
        reasons=reasons,
    )
def _best_phash_similarity(
session: Session,
scene_id: uuid.UUID,
incoming_fingerprints: list[tuple[str, str]],
) -> float | None:
"""Najlepsza similarity między phashami sceny w DB a incoming."""
incoming = [v for kind, v in incoming_fingerprints if kind == "phash"]
if not incoming:
return None
existing = (
session.execute(
select(SceneFingerprint.value).where(
SceneFingerprint.scene_id == scene_id,
SceneFingerprint.kind == "phash",
)
)
.scalars()
.all()
)
if not existing:
return None
best = 0.0
for left in incoming:
for right in existing:
if len(left) != len(right):
continue
try:
sim = phash_similarity(left, right)
except ValueError:
continue
if sim > best:
best = sim
return best
def _candidate_performer_ids(session: Session, scene_id: uuid.UUID) -> list[uuid.UUID]:
    """Return the performer ids linked to the scene ``scene_id`` as a list."""
    stmt = select(ScenePerformer.performer_id).where(ScenePerformer.scene_id == scene_id)
    performer_ids = session.execute(stmt).scalars().all()
    return list(performer_ids)