Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
245 lines
11 KiB
Python
245 lines
11 KiB
Python
"""Title-based duplicate detection per performer (no phash needed).
|
|
|
|
Complements the phash heuristic: when thumbnails differ (xvideos auto-grabs
them from different frames) but fragments of the titles overlap, the two
scenes are probably duplicates. We use fuzz.token_set_ratio (rapidfuzz)
because it:
- is tolerant of word order
- ignores repeated words
- ignores extra tokens
|
|
|
|
Normalization is aggressive — we strip:
1. Performer canonical_name (dominates most titles)
2. Site brand prefixes (Brazzers, LegalPorno, Mofos, etc.)
3. Generic adjectives (bimbo, milf, hot, sexy, blonde, brunette, teen)
4. Generic act words (anal, blowjob, dp, dvp, bbc, gangbang)
5. Hashtags (#blonde #bigtits)
6. Trailing IDs and domains (12117054, vidara.so, etc.). Note: the ID pattern
   needs at least four consecutive digits, so short mixed IDs such as xhWaKl9
   are not matched by it.
7. Punctuation + extra whitespace
|
|
|
|
Run: python /srv/scripts/title_levenshtein_benchmark.py <performer_id> [--threshold 80]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
from rapidfuzz import fuzz
|
|
from sqlalchemy import select
|
|
|
|
sys.path.insert(0, "/srv")
|
|
from app.db import SessionLocal
|
|
from app.models.performer import Performer
|
|
from app.models.scene import Scene, ScenePerformer
|
|
from app.models.playback_source import PlaybackSource
|
|
|
|
# Site brand tokens — they usually duplicate the studio info already stored in
# `scenes.studio`, so they do not help tell two titles apart.
_SITE_BRANDS = {
    "legalporno", "brazzers", "brazzersexxtra", "publicagent", "mofos",
    "twistys", "vivid", "ddf", "ddfbusty", "kink", "naughty", "naughtyamerica",
    "joymii", "julesjordan", "mompov", "perfectgirls", "puremature",
    "realitykings", "scoreland", "brazilbang", "foxes", "lubed",
    "castingcouchx", "throated", "collegerules", "fakehub", "faketaxi",
    "fakedrivingschool", "porn", "publicpickups",
    "blacked", "blackedraw", "tushy", "tushyraw", "vixen", "deeper",
    "evilangel", "newsensations", "private", "privatecom",
}

# Generic adjectives + act vocabulary — informative, but not title-distinguishing.
_NOISE_WORDS = {
    "new", "brand", "full", "leaked", "masked", "ppv", "video", "scene",
    "aka", "ft", "feat", "with", "and", "the", "a", "an",
    "bimbo", "milf", "teen", "teens", "blonde", "brunette", "redhead",
    "busty", "curvy", "skinny", "petite", "tall", "short", "thick", "thin",
    "amateur", "homemade", "professional", "kinky", "sexy", "hot", "horny",
    "beautiful", "gorgeous", "stunning", "perfect", "sweet", "naughty",
    "wild", "tight", "young", "older",
    "anal", "blowjob", "bj", "dp", "dvp", "dap", "dpp", "bbc", "bg",
    "gangbang", "fuck", "fucking", "fucked", "fucks",
    "boobs", "tits", "ass", "pussy", "cock", "dick", "facial", "cumshot",
    "creampie", "rough", "deep", "hard", "wet", "pov", "pounding",
    "smal", "small", "big", "huge", "massive", "intense",
    "loves", "gets", "takes", "gives", "shares", "wants", "needs",
    "her", "his", "she", "he", "for", "from", "to", "in", "on", "by",
    "of",
    "first", "best", "girl", "boy", "girls", "guy", "guys", "boys",
    "fwb", "wife", "wifey", "step", "stepmom", "stepdad", "stepsis",
}

# ID-like tokens: optional letters, then at least four consecutive digits, then
# any alphanumerics ("12117054", "scene12345", "1080p"). Despite the name the
# pattern matches anywhere in the title, not only at the end. Short mixed IDs
# with fewer than four consecutive digits (e.g. "xhWaKl9") are NOT matched.
_TRAILING_ID_RE = re.compile(r"\b[a-z]*\d{4,}[a-z\d]*\b", re.IGNORECASE)
# Bare domain names such as "vidara.so".
_DOMAIN_RE = re.compile(
    r"\b[a-z0-9]+\.(?:com|net|so|to|sx|tv|video|porn|xxx|cc|biz|info)\b",
    re.IGNORECASE,
)
_HASHTAG_RE = re.compile(r"#\w+")
# Applied to already lower-cased text: everything except a-z, 0-9 and whitespace.
_PUNCT_RE = re.compile(r"[^a-z0-9\s]+")
_WS_RE = re.compile(r"\s+")
# Dates + times — "Stream Started At MM/DD/YYYY HH:MM Pm" scenes are UNIQUE per
# date; once the date is stripped they all share one normalized title, which
# would produce a mass false-positive cluster.
_DATE_RE = re.compile(r"\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b")
_TIME_RE = re.compile(r"\b\d{1,2}:\d{2}\s*[ap]m?\b", re.IGNORECASE)
# Phrases that give away a per-instance recording (live stream session,
# schedule entry).
_LIVE_RE = re.compile(r"\bstream\s+started\s+at\b", re.IGNORECASE)

# Order matters: dates/times go first because they contain digit runs that the
# ID pattern would otherwise partially consume.
_STRIP_PATTERNS = (_DATE_RE, _TIME_RE, _HASHTAG_RE, _DOMAIN_RE, _TRAILING_ID_RE)


def _strip_accents(text: str) -> str:
    """Fold accented letters to their base letter ("é" -> "e").

    Without this, `_PUNCT_RE` treats every non-ASCII letter as punctuation and
    splits the word around it, so "Belén" and "Belen" normalize differently.
    """
    import unicodedata  # function-scope so the file-level imports stay untouched

    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _name_tokens(performer_name: str | None) -> frozenset[str]:
    """Return the performer name split into tokens the same way titles are."""
    if not performer_name:
        return frozenset()
    folded = _strip_accents(performer_name).lower()
    return frozenset(_PUNCT_RE.sub(" ", folded).split())


def normalize_title(title: str, performer_name: str | None = None) -> str:
    """Aggressively normalize a scene title for duplicate detection.

    Steps: fold accents, lower-case, strip dates, times, hashtags, domains and
    ID-like tokens, turn punctuation into spaces, then drop performer-name
    tokens, noise words, site brands and single-character tokens.

    Args:
        title: Raw scene title; ``None`` or empty is accepted.
        performer_name: Optional performer name whose tokens are removed from
            the title. The name is tokenised exactly like the title, so
            "Mary-Jane", "Mary Jane", "mary_jane" and "Jr." all match.

    Returns:
        The remaining tokens joined by single spaces. An empty string means
        "skip this title when comparing": either nothing meaningful is left or
        the title is a per-instance live recording ("Stream Started At ..."),
        whose entries would all collapse to one generic phrase once the date
        is stripped.
    """
    if not title:
        return ""
    text = _strip_accents(title)
    # Live recordings — skip entirely, every entry is unique per date.
    if _LIVE_RE.search(text):
        return ""
    text = text.lower()
    for pattern in _STRIP_PATTERNS:
        text = pattern.sub(" ", text)
    text = _PUNCT_RE.sub(" ", text)
    name_tokens = _name_tokens(performer_name)
    kept = [
        tok
        for tok in text.split()
        if len(tok) > 1
        and tok not in name_tokens
        and tok not in _NOISE_WORDS
        and tok not in _SITE_BRANDS
    ]
    return " ".join(kept)
|
|
|
|
|
|
# Scenes whose normalized title has fewer meaningful tokens than this are not
# compared: very short titles ("fetish boots", "private gold") fuzzy-match
# almost anything and only generate false positives.
_MIN_TOKENS = 3

# When BOTH durations are known, a pair is skipped (treated as different cuts)
# if the shorter scene is more than this fraction shorter than the longer one.
_MAX_DURATION_GAP = 0.30


def _score_pairs(scenes: list[tuple], threshold: int) -> tuple[dict, list[tuple]]:
    """Score every unordered pair of scenes by normalized-title similarity.

    Args:
        scenes: ``(scene_id, title, duration_sec, normalized_title)`` tuples.
        threshold: Minimum score for a pair to be reported.

    Returns:
        ``(histogram, pairs)``. ``histogram`` maps a score bucket (0, 10, ...,
        100) to the number of scored pairs in it. ``pairs`` holds
        ``(score, id_a, id_b, title_a, title_b, dur_a, dur_b)`` for every pair
        whose score is at least ``threshold``.
    """
    import itertools

    histogram: dict = defaultdict(int)
    pairs: list = []
    for first, second in itertools.combinations(scenes, 2):
        sid_a, title_a, dur_a, norm_a = first
        sid_b, title_b, dur_b, norm_b = second
        # Duration gate: only applied when BOTH durations are known. A single
        # missing duration is tolerated (tube sites often do not report one).
        if dur_a and dur_b:
            longer, shorter = max(dur_a, dur_b), min(dur_a, dur_b)
            if shorter > 0 and (longer - shorter) / longer > _MAX_DURATION_GAP:
                continue
        # Require BOTH metrics to be high. token_set_ratio alone yields false
        # positives when one title is a token subset of the other (e.g.
        # "Private Massage" vs "Private Massage Threesome").
        score = min(
            fuzz.token_set_ratio(norm_a, norm_b),
            fuzz.partial_ratio(norm_a, norm_b),
        )
        histogram[int(score // 10) * 10] += 1
        if score >= threshold:
            pairs.append((score, sid_a, sid_b, title_a, title_b, dur_a, dur_b))
    return histogram, pairs


def _cluster_roots(pairs: list[tuple]) -> dict:
    """Union-find over the scene ids in ``pairs``; maps each id to its cluster root."""
    parent: dict = {}

    def find(node):
        parent.setdefault(node, node)
        while parent[node] != node:
            parent[node] = parent[parent[node]]  # path halving
            node = parent[node]
        return node

    for _score, id_a, id_b, *_rest in pairs:
        root_a, root_b = find(id_a), find(id_b)
        if root_a != root_b:
            parent[root_a] = root_b
    return {node: find(node) for node in list(parent)}


def main() -> None:
    """Print a title-similarity duplicate report for one performer.

    Command line: ``<performer_id> [--threshold N]``. Loads the performer's
    scenes, normalizes their titles, scores all pairs, groups pairs at or
    above the threshold into clusters and prints the ten largest clusters
    together with their live playback sources.
    """
    import uuid as _uuid

    ap = argparse.ArgumentParser()
    # Parsing the UUID through argparse turns a malformed id into a normal
    # usage error instead of an uncaught ValueError traceback.
    ap.add_argument("performer_id", type=_uuid.UUID)
    ap.add_argument("--threshold", type=int, default=80, help="rapidfuzz token_set_ratio threshold (0-100)")
    args = ap.parse_args()
    perf_id = args.performer_id

    with SessionLocal() as session:
        perf = session.get(Performer, perf_id)
        if not perf:
            print(f"performer {perf_id} not found")
            return
        print(f"=== {perf.canonical_name} ({perf_id}) ===\n")

        rows = session.execute(
            select(Scene.id, Scene.title, Scene.duration_sec)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == perf_id)
        ).all()
        if not rows:
            print("no scenes")
            return

        # Keep only scenes whose title survives normalization.
        scenes = []
        for row in rows:
            norm = normalize_title(row.title, perf.canonical_name)
            if norm:
                scenes.append((row.id, row.title, row.duration_sec, norm))
        print(f"Total scenes with normalizable title: {len(scenes)}")
        print()

        # Show a normalization sample.
        print("Normalization examples (first 5):")
        for _sid, title, _dur, norm in scenes[:5]:
            print(f" '{title[:55]}' → '{norm[:55]}'")
        print()

        scenes_for_compare = [s for s in scenes if len(s[3].split()) >= _MIN_TOKENS]
        print(f"After min-tokens filter ({_MIN_TOKENS}+): {len(scenes_for_compare)} scenes ({len(scenes)-len(scenes_for_compare)} skipped)")
        print()

        histogram, pairs = _score_pairs(scenes_for_compare, args.threshold)

        # Printed before the "no pairs" exit: the distribution is what tells
        # the operator which lower threshold would be worth trying.
        if histogram:
            print("Token-set ratio distribution (all pairs):")
            for bucket in sorted(histogram, reverse=True):
                if bucket < 50:
                    break
                count = histogram[bucket]
                upper = min(bucket + 9, 100)  # scores never exceed 100
                bar = "#" * min(50, count)
                print(f" {bucket:3d}-{upper:3d} : {count:4d} {bar}")
            print()

        if not pairs:
            print(f"No pairs ≥ {args.threshold} threshold.")
            return

        print(f"Likely-duplicate pairs (token_set ≥ {args.threshold}): {len(pairs)}")
        print()

        # Group paired scenes into clusters, keeping the original scene order.
        root_of = _cluster_roots(pairs)
        clusters: dict = defaultdict(list)
        for scene in scenes_for_compare:
            root = root_of.get(scene[0])
            if root is not None:
                clusters[root].append(scene)
        dup_clusters = [c for c in clusters.values() if len(c) > 1]

        print(f"Duplicate clusters: {len(dup_clusters)}")
        total_redundant = sum(len(c) - 1 for c in dup_clusters)
        print(f"Total redundant scenes: {total_redundant} (merge to {len(dup_clusters)} canonical)")
        print()

        # Live playback sources per clustered scene, for context in the report.
        clustered_ids = [scene[0] for cluster in dup_clusters for scene in cluster]
        ps_by_scene: dict = defaultdict(set)
        source_rows = session.execute(
            select(PlaybackSource.scene_id, PlaybackSource.origin)
            .where(PlaybackSource.scene_id.in_(clustered_ids))
            .where(PlaybackSource.dead_at.is_(None))
        ).all()
        for row in source_rows:
            # Keep the part after the first ":" when there is one.
            head, sep, tail = row.origin.partition(":")
            ps_by_scene[row.scene_id].add(tail if sep else head)

        n_show = min(10, len(dup_clusters))
        print(f"=== Top {n_show} clusters ===\n")
        largest_first = sorted(dup_clusters, key=len, reverse=True)[:n_show]
        for rank, cluster in enumerate(largest_first, 1):
            print(f"--- Cluster {rank} ({len(cluster)} scenes) ---")
            for sid, title, dur, _norm in cluster:
                dur_s = f"{dur}s" if dur else "no-dur"
                sources = ",".join(sorted(ps_by_scene.get(sid, []))) or "no-src"
                print(f" {sid} {dur_s:>7s} [{sources[:35]:35s}] {title[:55]}")
            print()
|
|
|
|
|
|
# Entry point: run the benchmark only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|