"""Title-based duplicate detection per performer (no phash needed). Uzupełnia phash heuristykę: gdy thumbs są różne (xvideos auto-grab z różnych klatek) ale title fragmenty się pokrywają, te 2 sceny pewnie są dupami. Używamy fuzz.token_set_ratio (rapidfuzz) bo: - tolerancyjne na kolejność słów - ignoruje powtórzenia - ignoruje extra tokeny Normalization aggressive — usuwamy: 1. Performer canonical_name (dominuje większość tytułów) 2. Site brand prefixes (Brazzers, LegalPorno, Mofos, etc.) 3. Generic adjectives (bimbo, milf, hot, sexy, blonde, brunette, teen) 4. Generic act words (anal, blowjob, dp, dvp, bbc, gangbang) 5. Hashtags (#blonde #bigtits) 6. Trailing IDs (xhWaKl9, 12117054, vidara.so, etc.) 7. Punctuation + extra whitespace Run: python /srv/scripts/title_levenshtein_benchmark.py [--threshold 80] """ from __future__ import annotations import argparse import re import sys from collections import defaultdict from rapidfuzz import fuzz from sqlalchemy import select sys.path.insert(0, "/srv") from app.db import SessionLocal from app.models.performer import Performer from app.models.scene import Scene, ScenePerformer from app.models.playback_source import PlaybackSource # Site brand prefixes — często duplikują studio info już w `scenes.studio`. _SITE_BRANDS = { "legalporno", "brazzers", "brazzersexxtra", "publicagent", "mofos", "twistys", "vivid", "ddf", "ddfbusty", "kink", "naughty", "naughtyamerica", "joymii", "julesjordan", "mompov", "perfectgirls", "puremature", "realitykings", "scoreland", "brazilbang", "foxes", "lubed", "castingcouchx", "throated", "collegerules", "fakehub", "faketaxi", "fakedrivingschool", "porn", "brazzersexxtra", "publicpickups", "blacked", "blackedraw", "tushy", "tushyraw", "vixen", "deeper", "evilangel", "newsensations", "private", "privatecom", } # Generic adjectives + act vocab — informacyjne ale nie title-distinguishing. _NOISE_WORDS = { "new", "brand", "full", "leaked", "masked", "ppv", "video", "scene", "aka", "ft", "feat", "with", "and", "the", "a", "an", "bimbo", "milf", "teen", "teens", "blonde", "brunette", "redhead", "busty", "curvy", "skinny", "petite", "tall", "short", "thick", "thin", "amateur", "homemade", "professional", "kinky", "sexy", "hot", "horny", "beautiful", "gorgeous", "stunning", "perfect", "sweet", "naughty", "wild", "tight", "young", "older", "anal", "blowjob", "bj", "dp", "dvp", "dap", "dpp", "bbc", "bg", "gangbang", "fuck", "fucking", "fucked", "fucks", "boobs", "tits", "ass", "pussy", "cock", "dick", "facial", "cumshot", "creampie", "rough", "deep", "hard", "wet", "pov", "pounding", "smal", "small", "big", "huge", "massive", "intense", "loves", "gets", "takes", "gives", "shares", "wants", "needs", "her", "his", "she", "he", "for", "from", "to", "in", "on", "by", "of", "her", "with", "first", "her", "best", "girl", "boy", "girls", "guy", "guys", "boys", "fwb", "ass", "wife", "wifey", "step", "stepmom", "stepdad", "stepsis", } # Trailing tokens we drop: numeric IDs, short alphanumerics, domains _TRAILING_ID_RE = re.compile(r'\b(?:[a-z]*\d{4,}[a-z\d]*|\d{4,})\b', re.IGNORECASE) _DOMAIN_RE = re.compile(r'\b[a-z0-9]+\.(?:com|net|so|to|sx|tv|video|porn|xxx|cc|biz|info)\b', re.IGNORECASE) _HASHTAG_RE = re.compile(r'#\w+') _PUNCT_RE = re.compile(r'[^a-z0-9\s]+') _WS_RE = re.compile(r'\s+') # Dates + times — sceny "Stream Started At MM/DD/YYYY HH:MM Pm" są UNIKALNE per date, # po stripping mają identyczny normalized title → mass false-positive cluster. _DATE_RE = re.compile(r'\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b') _TIME_RE = re.compile(r'\b\d{1,2}:\d{2}\s*[ap]m?\b', re.IGNORECASE) # Phrases które zdradzają per-instance recording (live stream session, schedule entry). _LIVE_RE = re.compile(r'\bstream\s+started\s+at\b', re.IGNORECASE) def normalize_title(title: str, performer_name: str | None = None) -> str: """Aggressive normalization dla dup-detection. Zwraca pusty string (skip in compare) dla per-instance recordings które by się ścinały do generic phrase (np. "Stream Started At ..." — wszystkie sesje live mają identyczny normalized title bo daty są stripped). """ if not title: return "" # Live recordings — skip całkowicie, każdy entry unique per date. if _LIVE_RE.search(title): return "" t = title.lower() # Strip dates/times PRZED resztą (mogą zawierać digits które _TRAILING_ID drop). t = _DATE_RE.sub(" ", t) t = _TIME_RE.sub(" ", t) # Strip hashtags + domains + numeric IDs (xhWaKl9, 12117054, vidara.so). t = _HASHTAG_RE.sub(" ", t) t = _DOMAIN_RE.sub(" ", t) t = _TRAILING_ID_RE.sub(" ", t) # Strip performer name if known if performer_name: for part in performer_name.lower().split(): t = re.sub(rf'\b{re.escape(part)}\b', " ", t) # Punctuation → space, collapse whitespace t = _PUNCT_RE.sub(" ", t) t = _WS_RE.sub(" ", t).strip() # Drop noise words (per token) tokens = [tok for tok in t.split() if tok not in _NOISE_WORDS and tok not in _SITE_BRANDS and len(tok) > 1] return " ".join(tokens) def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("performer_id", type=str) ap.add_argument("--threshold", type=int, default=80, help="rapidfuzz token_set_ratio threshold (0-100)") args = ap.parse_args() import uuid as _uuid perf_id = _uuid.UUID(args.performer_id) with SessionLocal() as session: perf = session.get(Performer, perf_id) if not perf: print(f"performer {perf_id} not found") return print(f"=== {perf.canonical_name} ({perf_id}) ===\n") rows = session.execute( select(Scene.id, Scene.title, Scene.duration_sec) .join(ScenePerformer, ScenePerformer.scene_id == Scene.id) .where(ScenePerformer.performer_id == perf_id) ).all() if not rows: print("no scenes") return scenes = [(r.id, r.title, r.duration_sec, normalize_title(r.title, perf.canonical_name)) for r in rows] scenes = [s for s in scenes if s[3]] # skip empty normalized print(f"Total scenes with normalizable title: {len(scenes)}") print() # Show normalization sample print("Normalization examples (first 5):") for sid, title, dur, norm in scenes[:5]: print(f" '{title[:55]}' → '{norm[:55]}'") print() # Filter: scenes z min 3 znaczącymi tokenami (krótkie tytuły jak "fetish boots" # albo "private gold" generują false-positive z byle czym). MIN_TOKENS = 3 scenes_for_compare = [s for s in scenes if len(s[3].split()) >= MIN_TOKENS] print(f"After min-tokens filter ({MIN_TOKENS}+): {len(scenes_for_compare)} scenes ({len(scenes)-len(scenes_for_compare)} skipped)") print() # Pairwise fuzzy compare + duration similarity gate histogram: dict = defaultdict(int) pairs: list = [] for i in range(len(scenes_for_compare)): sid_a, title_a, dur_a, norm_a = scenes_for_compare[i] for j in range(i + 1, len(scenes_for_compare)): sid_b, title_b, dur_b, norm_b = scenes_for_compare[j] # Duration gate: jeśli OBA znane, dropny gdy różnica > 30% (różne cuts). # Jeden None tolerujemy (tube'y często nie zwracają duration). if dur_a and dur_b: longer, shorter = max(dur_a, dur_b), min(dur_a, dur_b) if shorter > 0 and (longer - shorter) / longer > 0.30: continue # Use token_set_ratio (kolejność słów + extra tokeny tolerated). # Plus partial_ratio as secondary (substring match). ts = fuzz.token_set_ratio(norm_a, norm_b) pr = fuzz.partial_ratio(norm_a, norm_b) # Wymóg: BOTH high. token_set_ratio sam daje false-positives gdy # jeden norm_a jest subset (np. "Private Massage" subset "Private Massage Threesome"). score = min(ts, pr) bucket = int(score // 10) * 10 histogram[bucket] += 1 if score >= args.threshold: pairs.append((score, sid_a, sid_b, title_a, title_b, dur_a, dur_b)) if not pairs: print(f"No pairs ≥ {args.threshold} threshold.") return print("Token-set ratio distribution (all pairs):") for b in sorted(histogram.keys(), reverse=True): if b < 50: break bar = "#" * min(50, histogram[b]) print(f" {b:3d}-{b+9:3d} : {histogram[b]:4d} {bar}") print() print(f"Likely-duplicate pairs (token_set ≥ {args.threshold}): {len(pairs)}") print() # Union-find clusters parent: dict = {s[0]: s[0] for s in scenes} def find(x): while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x def union(a, b): ra, rb = find(a), find(b) if ra != rb: parent[ra] = rb for score, a, b, _, _, _, _ in pairs: union(a, b) clusters: dict = defaultdict(list) for s in scenes: clusters[find(s[0])].append(s) dup_clusters = [c for c in clusters.values() if len(c) > 1] print(f"Duplicate clusters: {len(dup_clusters)}") total_redundant = sum(len(c) - 1 for c in dup_clusters) print(f"Total redundant scenes: {total_redundant} (merge to {len(dup_clusters)} canonical)") print() # Sources per scene for context ps_by_scene: dict = defaultdict(set) for row in session.execute( select(PlaybackSource.scene_id, PlaybackSource.origin) .where(PlaybackSource.scene_id.in_([s[0] for c in dup_clusters for s in c])) .where(PlaybackSource.dead_at.is_(None)) ).all(): origin = row.origin.split(":", 1)[1] if ":" in row.origin else row.origin ps_by_scene[row.scene_id].add(origin) n_show = min(10, len(dup_clusters)) print(f"=== Top {n_show} clusters ===\n") for i, cluster in enumerate(sorted(dup_clusters, key=len, reverse=True)[:n_show], 1): print(f"--- Cluster {i} ({len(cluster)} scenes) ---") for sid, title, dur, norm in cluster: dur_s = f"{dur}s" if dur else "no-dur" sources = ",".join(sorted(ps_by_scene.get(sid, []))) or "no-src" print(f" {sid} {dur_s:>7s} [{sources[:35]:35s}] {title[:55]}") print() if __name__ == "__main__": main()