goon/scripts/phash_benchmark.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

153 lines
5.9 KiB
Python

"""Benchmark report dla phash backfill per performer.
Pokazuje:
1. Coverage: ile scen ma phash / ile mogłoby (z thumb)
2. Distribution: Hamming distance histogram across all pairs (within performer)
3. Clusters: groups scen z phash Hamming ≤ THRESHOLD (likely-duplicate)
4. Sample par: title/duration/source dla 5 najlepszych match candidates
Run: python /srv/scripts/phash_benchmark.py <performer_id> [--threshold N]
"""
from __future__ import annotations
import argparse
import sys
from collections import defaultdict
from sqlalchemy import select
sys.path.insert(0, "/srv")
from app.db import SessionLocal
from app.models.performer import Performer
from app.models.scene import Scene, ScenePerformer, SceneFingerprint
from app.models.playback_source import PlaybackSource
from app.resolve.scoring import hamming_distance_hex
def _pairwise_min_distances(ph_by_scene: dict, threshold: int) -> tuple[dict, list]:
    """Compare every unordered pair of scenes by their closest phash pair.

    Args:
        ph_by_scene: mapping of scene id -> non-empty list of phash hex strings.
        threshold: maximum Hamming distance for a pair to count as a likely
            duplicate.

    Returns:
        ``(histogram, likely_dup)``: ``histogram`` maps a distance to the number
        of scene pairs at that distance; ``likely_dup`` is the sorted list of
        ``(dist, sid_a, sid_b)`` tuples with ``dist <= threshold``.
    """
    from itertools import combinations

    histogram: dict = defaultdict(int)
    likely_dup: list = []
    for (sid_a, hashes_a), (sid_b, hashes_b) in combinations(ph_by_scene.items(), 2):
        # A scene can have multiple phashes; a pair is as close as its two
        # closest hashes.
        min_d = min(hamming_distance_hex(ha, hb) for ha in hashes_a for hb in hashes_b)
        histogram[min_d] += 1
        # Only pairs within the threshold are needed later; storing all
        # O(n^2) pairs would waste memory for large performers.
        if min_d <= threshold:
            likely_dup.append((min_d, sid_a, sid_b))
    likely_dup.sort()
    return histogram, likely_dup


def _duplicate_clusters(scene_ids, likely_dup: list) -> list:
    """Group scenes connected by likely-duplicate pairs using union-find.

    Args:
        scene_ids: re-iterable collection of scene ids (iterated twice).
        likely_dup: ``(dist, sid_a, sid_b)`` tuples whose ids are in ``scene_ids``.

    Returns:
        List of clusters (lists of scene ids) that have more than one member.
    """
    parent: dict = {sid: sid for sid in scene_ids}

    def find(x):
        # Walk up to the root, re-pointing each visited node at its
        # grandparent so later lookups are shorter.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    for _dist, a, b in likely_dup:
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    clusters: dict = defaultdict(list)
    for sid in scene_ids:
        clusters[find(sid)].append(sid)
    return [members for members in clusters.values() if len(members) > 1]


def main() -> None:
    """Print a phash duplicate-detection report for one performer.

    Reads ``performer_id`` (a UUID) and ``--threshold`` from the command line,
    loads the performer's scenes and their ``phash`` fingerprints, then prints
    coverage, the pairwise Hamming distance histogram, the number of
    likely-duplicate pairs, and a sample of the largest duplicate clusters.

    A malformed ``performer_id`` is rejected by argparse with a usage error
    instead of an uncaught ``ValueError`` traceback.
    """
    import uuid as _uuid

    ap = argparse.ArgumentParser()
    # argparse turns the ValueError raised by UUID() into a clean usage error.
    ap.add_argument("performer_id", type=_uuid.UUID)
    ap.add_argument("--threshold", type=int, default=5, help="Hamming threshold for likely-dup")
    args = ap.parse_args()
    perf_id = args.performer_id

    with SessionLocal() as session:
        perf = session.get(Performer, perf_id)
        if not perf:
            print(f"performer {perf_id} not found")
            return
        print(f"=== {perf.canonical_name} ({perf_id}) ===\n")

        # 1) Scenes + phash
        scenes = session.execute(
            select(Scene.id, Scene.title, Scene.duration_sec)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == perf_id)
        ).all()
        scene_ids = {s.id for s in scenes}
        title_by_id = {s.id: s.title for s in scenes}
        dur_by_id = {s.id: s.duration_sec for s in scenes}

        phashes = session.execute(
            select(SceneFingerprint.scene_id, SceneFingerprint.value)
            .where(SceneFingerprint.kind == "phash")
            .where(SceneFingerprint.scene_id.in_(scene_ids))
        ).all()
        ph_by_scene: dict = defaultdict(list)
        for sid, val in phashes:
            # Skip garbage entries (length != 16 = failed extract, "0"
            # placeholder from legacy TPDB ingest, etc.).
            if val and len(val) == 16:
                ph_by_scene[sid].append(val)

        n_total = len(scene_ids)
        n_with_phash = len(ph_by_scene)
        print(f"Coverage: {n_with_phash}/{n_total} scenes have phash ({100*n_with_phash/max(n_total,1):.0f}%)")
        print()

        # 2) Pairwise distance distribution
        if n_with_phash < 2:
            print("No pairs to compare (need ≥2 scenes with phash).")
            return
        histogram, likely_dup = _pairwise_min_distances(ph_by_scene, args.threshold)

        print("Hamming distance distribution (all pairs):")
        for d in sorted(histogram):
            if d > 20:
                break
            bar = "#" * min(50, histogram[d])
            print(f"  d={d:2d} : {histogram[d]:4d}  {bar}")
        n_unique = sum(c for d, c in histogram.items() if d > 20)
        print(f"  d>20: {n_unique:4d}  (effectively unrelated)")
        print()

        # 3) Pairs within threshold
        print(f"Likely-duplicate pairs (Hamming ≤ {args.threshold}): {len(likely_dup)}")
        print()
        if not likely_dup:
            print("No likely duplicates found.")
            return

        # 4) Group into clusters (union-find)
        dup_clusters = _duplicate_clusters(ph_by_scene, likely_dup)
        print(f"Duplicate clusters: {len(dup_clusters)}")
        total_redundant = sum(len(c) - 1 for c in dup_clusters)
        print(f"Total redundant scenes: {total_redundant} (would merge to {len(dup_clusters)} canonical)")
        print()

        # 5) Print sample clusters (largest first)
        top_clusters = sorted(dup_clusters, key=len, reverse=True)[:5]
        print(f"=== Sample top {len(top_clusters)} clusters ===\n")

        # Resolve live sources only for the scenes that are actually printed.
        ps_by_scene: dict = defaultdict(list)
        for row in session.execute(
            select(PlaybackSource.scene_id, PlaybackSource.origin)
            .where(PlaybackSource.scene_id.in_([s for c in top_clusters for s in c]))
            .where(PlaybackSource.dead_at.is_(None))
        ).all():
            # Keep the part after the first ":" when there is one.
            _prefix, sep, rest = row.origin.partition(":")
            ps_by_scene[row.scene_id].append(rest if sep else row.origin)

        for i, cluster in enumerate(top_clusters, 1):
            print(f"--- Cluster {i} ({len(cluster)} scenes) ---")
            for sid in cluster:
                # The stored title can be None; .get()'s default only covers
                # a missing key, so fall back explicitly before slicing.
                title = (title_by_id.get(sid) or "?")[:65]
                dur = dur_by_id.get(sid)
                dur_s = f"{dur}s" if dur else "no-dur"
                sources = ",".join(sorted(set(ps_by_scene.get(sid, [])))) or "no-source"
                ph = ph_by_scene[sid][0]
                print(f"  {sid} {ph} {dur_s:>7s} [{sources}] {title}")
            print()
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()