goon/scripts/title_levenshtein_benchmark.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

245 lines
11 KiB
Python

"""Title-based duplicate detection per performer (no phash needed).

Complements the phash heuristic: when thumbnails differ (xvideos auto-grabs them
from different frames) but title fragments overlap, the two scenes are probably
duplicates. We use fuzz.token_set_ratio (rapidfuzz) because it:
- tolerates differences in word order
- ignores repeated words
- ignores extra tokens

Aggressive normalization — we remove:
1. Performer canonical_name (dominates most titles)
2. Site brand prefixes (Brazzers, LegalPorno, Mofos, etc.)
3. Generic adjectives (bimbo, milf, hot, sexy, blonde, brunette, teen)
4. Generic act words (anal, blowjob, dp, dvp, bbc, gangbang)
5. Hashtags (#blonde #bigtits)
6. Trailing IDs and domains (12117054, vidara.so, etc.). Note: IDs without a
   run of 4+ digits, e.g. xhWaKl9, are not matched by the current pattern.
7. Punctuation + extra whitespace

Run: python /srv/scripts/title_levenshtein_benchmark.py <performer_id> [--threshold 80]
"""
from __future__ import annotations
import argparse
import re
import sys
from collections import defaultdict
from rapidfuzz import fuzz
from sqlalchemy import select
sys.path.insert(0, "/srv")
from app.db import SessionLocal
from app.models.performer import Performer
from app.models.scene import Scene, ScenePerformer
from app.models.playback_source import PlaybackSource
# Site brand names that appear as title prefixes. They often duplicate the
# studio info already stored in `scenes.studio`, so normalize_title() drops
# any lowercased title token found here. Each brand is listed exactly once.
_SITE_BRANDS = {
    "legalporno", "brazzers", "brazzersexxtra", "publicagent", "mofos",
    "twistys", "vivid", "ddf", "ddfbusty", "kink", "naughty", "naughtyamerica",
    "joymii", "julesjordan", "mompov", "perfectgirls", "puremature",
    "realitykings", "scoreland", "brazilbang", "foxes", "lubed",
    "castingcouchx", "throated", "collegerules", "fakehub", "faketaxi",
    "fakedrivingschool", "porn", "publicpickups",
    "blacked", "blackedraw", "tushy", "tushyraw", "vixen", "deeper",
    "evilangel", "newsensations", "private", "privatecom",
}
# Generic adjectives, act vocabulary and function words: informative, but they
# do not help tell one title from another, so normalize_title() drops any
# lowercased title token found here. Each word is listed exactly once.
_NOISE_WORDS = {
    # Packaging and connective words.
    "new", "brand", "full", "leaked", "masked", "ppv", "video", "scene",
    "aka", "ft", "feat", "with", "and", "the", "a", "an",
    # Generic descriptors.
    "bimbo", "milf", "teen", "teens", "blonde", "brunette", "redhead",
    "busty", "curvy", "skinny", "petite", "tall", "short", "thick", "thin",
    "amateur", "homemade", "professional", "kinky", "sexy", "hot", "horny",
    "beautiful", "gorgeous", "stunning", "perfect", "sweet", "naughty",
    "wild", "tight", "young", "older",
    # Generic act vocabulary.
    "anal", "blowjob", "bj", "dp", "dvp", "dap", "dpp", "bbc", "bg",
    "gangbang", "fuck", "fucking", "fucked", "fucks",
    "boobs", "tits", "ass", "pussy", "cock", "dick", "facial", "cumshot",
    "creampie", "rough", "deep", "hard", "wet", "pov", "pounding",
    # NOTE(review): "smal" is kept from the original list; presumably it
    # catches a common misspelling of "small" — confirm.
    "smal", "small", "big", "huge", "massive", "intense",
    # Verbs, pronouns and prepositions.
    "loves", "gets", "takes", "gives", "shares", "wants", "needs",
    "her", "his", "she", "he", "for", "from", "to", "in", "on", "by",
    "of",
    # Generic roles.
    "first", "best", "girl", "boy", "girls", "guy", "guys", "boys",
    "fwb", "wife", "wifey", "step", "stepmom", "stepdad", "stepsis",
}
# Opaque ID tokens: a word made of optional leading letters, then at least four
# consecutive digits, then any letters/digits (e.g. "12117054", "xh12117054").
# IDs without such a digit run (e.g. "xhWaKl9") are NOT matched.
_TRAILING_ID_RE = re.compile(r"\b(?:[a-z]*\d{4,}[a-z\d]*|\d{4,})\b", re.I)
# Bare domain names with one of the listed TLDs (e.g. "vidara.so").
_DOMAIN_RE = re.compile(r"\b[a-z0-9]+\.(?:com|net|so|to|sx|tv|video|porn|xxx|cc|biz|info)\b", re.I)
# Hashtags such as "#blonde".
_HASHTAG_RE = re.compile(r"#\w+")
# Anything that is not a lowercase ASCII letter, digit or whitespace; meant to
# be applied after the title has been lowercased.
_PUNCT_RE = re.compile(r"[^a-z0-9\s]+")
# Runs of whitespace, collapsed to a single space by the caller.
_WS_RE = re.compile(r"\s+")
# Dates and times. Scenes titled "Stream Started At MM/DD/YYYY HH:MM Pm" are
# UNIQUE per date, but once the date is stripped they all share the same
# normalized title, which would produce one huge false-positive cluster.
_DATE_RE = re.compile(r"\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b")
# Only times followed by an a/p(m) marker are matched; a bare "HH:MM" is not.
_TIME_RE = re.compile(r"\b\d{1,2}:\d{2}\s*[ap]m?\b", re.I)
# Phrases that give away a per-instance recording (live stream session,
# schedule entry).
_LIVE_RE = re.compile(r"\bstream\s+started\s+at\b", re.I)
def normalize_title(title: str, performer_name: str | None = None) -> str:
    """Reduce a scene title to its distinguishing tokens for dup-detection.

    Args:
        title: raw scene title; a falsy value yields an empty result.
        performer_name: optional performer name; each whitespace-separated
            part of it is removed from the title as a whole word.

    Returns:
        The surviving lowercase tokens joined by single spaces. An empty
        string means "skip this scene in comparisons": it is returned for
        falsy titles, for per-instance recordings (e.g. "Stream Started
        At ...", which would all collapse to the same generic phrase once
        their dates are stripped), and when every token is filtered out.
    """
    # Live recordings are skipped entirely: each entry is unique per date.
    if not title or _LIVE_RE.search(title):
        return ""

    text = title.lower()

    # Order matters: dates and times go first because they contain digit runs
    # that the ID pattern would otherwise strip piecemeal. Hashtags, domains
    # and ID-like tokens follow.
    for pattern in (_DATE_RE, _TIME_RE, _HASHTAG_RE, _DOMAIN_RE, _TRAILING_ID_RE):
        text = pattern.sub(" ", text)

    # Remove every part of the performer name, when one is known.
    if performer_name:
        for name_part in performer_name.lower().split():
            text = re.sub(rf"\b{re.escape(name_part)}\b", " ", text)

    # Punctuation becomes a space, then whitespace runs are collapsed.
    words = _WS_RE.sub(" ", _PUNCT_RE.sub(" ", text)).strip().split()

    # Keep only tokens that are longer than one character and are neither
    # noise words nor site brands.
    kept = []
    for word in words:
        if len(word) <= 1:
            continue
        if word in _NOISE_WORDS or word in _SITE_BRANDS:
            continue
        kept.append(word)
    return " ".join(kept)
def _score_pairs(scenes, threshold):
    """Fuzzy-compare every unordered pair of scenes.

    Args:
        scenes: sequence of ``(scene_id, title, duration_sec, normalized_title)``
            tuples.
        threshold: minimum combined score (0-100) for a pair to be reported.

    Returns:
        ``(histogram, pairs)``. ``histogram`` maps a score bucket (the score
        rounded down to a multiple of 10) to the number of compared pairs in
        that bucket; pairs rejected by the duration gate are not counted.
        ``pairs`` holds ``(score, id_a, id_b, title_a, title_b, dur_a, dur_b)``
        for every compared pair whose score is at least ``threshold``.
    """
    import itertools

    histogram: dict = defaultdict(int)
    pairs: list = []
    for scene_a, scene_b in itertools.combinations(scenes, 2):
        sid_a, title_a, dur_a, norm_a = scene_a
        sid_b, title_b, dur_b, norm_b = scene_b
        # Duration gate: when BOTH durations are known, drop the pair if they
        # differ by more than 30% (different cuts). A single missing duration
        # is tolerated because tube sites often do not return one.
        if dur_a and dur_b:
            longer, shorter = max(dur_a, dur_b), min(dur_a, dur_b)
            if shorter > 0 and (longer - shorter) / longer > 0.30:
                continue
        # token_set_ratio tolerates word order and extra tokens;
        # partial_ratio is the secondary (substring) signal.
        ts = fuzz.token_set_ratio(norm_a, norm_b)
        pr = fuzz.partial_ratio(norm_a, norm_b)
        # Require BOTH to be high: token_set_ratio alone gives false positives
        # when one title is a subset of the other (e.g. "Private Massage" vs
        # "Private Massage Threesome").
        score = min(ts, pr)
        histogram[int(score // 10) * 10] += 1
        if score >= threshold:
            pairs.append((score, sid_a, sid_b, title_a, title_b, dur_a, dur_b))
    return histogram, pairs


def _cluster_scenes(scenes, pairs):
    """Merge matched pairs into duplicate clusters using union-find.

    Args:
        scenes: sequence of scene tuples whose first element is the scene id.
        pairs: tuples whose second and third elements are the ids of two
            scenes considered duplicates.

    Returns:
        A list of clusters (lists of scene tuples) that contain more than one
        scene, ordered by the first appearance of each cluster in ``scenes``.
    """
    parent: dict = {s[0]: s[0] for s in scenes}

    def find(x):
        # Walk up to the root, pointing each visited node at its grandparent
        # to shorten later lookups.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    for _score, a, b, *_rest in pairs:
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_a] = root_b

    clusters: dict = defaultdict(list)
    for s in scenes:
        clusters[find(s[0])].append(s)
    return [c for c in clusters.values() if len(c) > 1]


def main() -> None:
    """CLI entry point: report likely duplicate scenes for one performer.

    Loads the performer's scenes, normalizes their titles, fuzzy-compares all
    pairs whose normalized title has at least three tokens, prints a score
    histogram, groups matches into clusters and prints the ten largest
    clusters together with their live playback sources.

    Command line: ``<performer_id> [--threshold N]``. A ``performer_id`` that
    is not a valid UUID is rejected by argparse with a usage error.
    """
    import uuid as _uuid

    ap = argparse.ArgumentParser()
    # Parsing the UUID inside argparse turns a malformed id into a clean usage
    # error instead of an uncaught ValueError traceback.
    ap.add_argument("performer_id", type=_uuid.UUID)
    ap.add_argument(
        "--threshold",
        type=int,
        default=80,
        help="rapidfuzz token_set_ratio threshold (0-100)",
    )
    args = ap.parse_args()
    perf_id = args.performer_id

    with SessionLocal() as session:
        perf = session.get(Performer, perf_id)
        if not perf:
            print(f"performer {perf_id} not found")
            return
        print(f"=== {perf.canonical_name} ({perf_id}) ===\n")

        rows = session.execute(
            select(Scene.id, Scene.title, Scene.duration_sec)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == perf_id)
        ).all()
        if not rows:
            print("no scenes")
            return

        scenes = [
            (r.id, r.title, r.duration_sec, normalize_title(r.title, perf.canonical_name))
            for r in rows
        ]
        scenes = [s for s in scenes if s[3]]  # skip empty normalized titles
        print(f"Total scenes with normalizable title: {len(scenes)}")
        print()

        # Show a normalization sample.
        print("Normalization examples (first 5):")
        for sid, title, dur, norm in scenes[:5]:
            # The separator keeps the raw and normalized titles visually apart.
            print(f" '{title[:55]}' → '{norm[:55]}'")
        print()

        # Filter: keep scenes with at least 3 meaningful tokens. Short titles
        # such as "fetish boots" or "private gold" produce false positives
        # against almost anything.
        MIN_TOKENS = 3
        scenes_for_compare = [s for s in scenes if len(s[3].split()) >= MIN_TOKENS]
        print(f"After min-tokens filter ({MIN_TOKENS}+): {len(scenes_for_compare)} scenes ({len(scenes)-len(scenes_for_compare)} skipped)")
        print()

        # Pairwise fuzzy compare with a duration-similarity gate.
        histogram, pairs = _score_pairs(scenes_for_compare, args.threshold)
        if not pairs:
            print(f"No pairs ≥ {args.threshold} threshold.")
            return

        print("Token-set ratio distribution (all pairs):")
        for b in sorted(histogram, reverse=True):
            # Buckets are visited in descending order, so the first one below
            # 50 ends the listing.
            if b < 50:
                break
            bar = "#" * min(50, histogram[b])
            print(f" {b:3d}-{b+9:3d} : {histogram[b]:4d} {bar}")
        print()
        print(f"Likely-duplicate pairs (token_set ≥ {args.threshold}): {len(pairs)}")
        print()

        # Union-find clusters.
        dup_clusters = _cluster_scenes(scenes, pairs)
        print(f"Duplicate clusters: {len(dup_clusters)}")
        total_redundant = sum(len(c) - 1 for c in dup_clusters)
        print(f"Total redundant scenes: {total_redundant} (merge to {len(dup_clusters)} canonical)")
        print()

        # Sources per scene, for context. Only sources with no `dead_at` value
        # are considered.
        ps_by_scene: dict = defaultdict(set)
        for row in session.execute(
            select(PlaybackSource.scene_id, PlaybackSource.origin)
            .where(PlaybackSource.scene_id.in_([s[0] for c in dup_clusters for s in c]))
            .where(PlaybackSource.dead_at.is_(None))
        ).all():
            # Keep only the part after the first ":" when the origin has one.
            origin = row.origin.split(":", 1)[1] if ":" in row.origin else row.origin
            ps_by_scene[row.scene_id].add(origin)

        n_show = min(10, len(dup_clusters))
        print(f"=== Top {n_show} clusters ===\n")
        for i, cluster in enumerate(sorted(dup_clusters, key=len, reverse=True)[:n_show], 1):
            print(f"--- Cluster {i} ({len(cluster)} scenes) ---")
            for sid, title, dur, norm in cluster:
                dur_s = f"{dur}s" if dur else "no-dur"
                sources = ",".join(sorted(ps_by_scene.get(sid, []))) or "no-src"
                print(f" {sid} {dur_s:>7s} [{sources[:35]:35s}] {title[:55]}")
            print()
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()