# goon/scripts/title_levenshtein_benchmark.py

"""Title-based duplicate detection per performer (no phash needed).

Uzupełnia phash heuristykę: gdy thumbs są różne (xvideos auto-grab z różnych
klatek) ale title fragmenty się pokrywają, te 2 sceny pewnie są dupami. Używamy
fuzz.token_set_ratio (rapidfuzz) bo:
- tolerancyjne na kolejność słów
- ignoruje powtórzenia
- ignoruje extra tokeny

Normalization aggressive — usuwamy:
1. Performer canonical_name (dominuje większość tytułów)
2. Site brand prefixes (Brazzers, LegalPorno, Mofos, etc.)
3. Generic adjectives (bimbo, milf, hot, sexy, blonde, brunette, teen)
4. Generic act words (anal, blowjob, dp, dvp, bbc, gangbang)
5. Hashtags (#blonde #bigtits)
6. Trailing IDs (xhWaKl9, 12117054, vidara.so, etc.)
7. Punctuation + extra whitespace

Run: python /srv/scripts/title_levenshtein_benchmark.py <performer_id> [--threshold 80]
"""
from __future__ import annotations

import argparse
import re
import sys
from collections import defaultdict

from rapidfuzz import fuzz
from sqlalchemy import select

sys.path.insert(0, "/srv")
from app.db import SessionLocal
from app.models.performer import Performer
from app.models.scene import Scene, ScenePerformer
from app.models.playback_source import PlaybackSource

# Site brand prefixes — często duplikują studio info już w `scenes.studio`.
_SITE_BRANDS = {
    "legalporno", "brazzers", "brazzersexxtra", "publicagent", "mofos",
    "twistys", "vivid", "ddf", "ddfbusty", "kink", "naughty", "naughtyamerica",
    "joymii", "julesjordan", "mompov", "perfectgirls", "puremature",
    "realitykings", "scoreland", "brazilbang", "foxes", "lubed",
    "castingcouchx", "throated", "collegerules", "fakehub", "faketaxi",
    "fakedrivingschool", "porn", "brazzersexxtra", "publicpickups",
    "blacked", "blackedraw", "tushy", "tushyraw", "vixen", "deeper",
    "evilangel", "newsensations", "private", "privatecom",
}

# Generic adjectives + act vocab — informacyjne ale nie title-distinguishing.
_NOISE_WORDS = {
    "new", "brand", "full", "leaked", "masked", "ppv", "video", "scene",
    "aka", "ft", "feat", "with", "and", "the", "a", "an",
    "bimbo", "milf", "teen", "teens", "blonde", "brunette", "redhead",
    "busty", "curvy", "skinny", "petite", "tall", "short", "thick", "thin",
    "amateur", "homemade", "professional", "kinky", "sexy", "hot", "horny",
    "beautiful", "gorgeous", "stunning", "perfect", "sweet", "naughty",
    "wild", "tight", "young", "older",
    "anal", "blowjob", "bj", "dp", "dvp", "dap", "dpp", "bbc", "bg",
    "gangbang", "fuck", "fucking", "fucked", "fucks",
    "boobs", "tits", "ass", "pussy", "cock", "dick", "facial", "cumshot",
    "creampie", "rough", "deep", "hard", "wet", "pov", "pounding",
    "smal", "small", "big", "huge", "massive", "intense",
    "loves", "gets", "takes", "gives", "shares", "wants", "needs",
    "her", "his", "she", "he", "for", "from", "to", "in", "on", "by",
    "of", "her", "with",
    "first", "her", "best", "girl", "boy", "girls", "guy", "guys", "boys",
    "fwb", "ass", "wife", "wifey", "step", "stepmom", "stepdad", "stepsis",
}

# Trailing tokens we drop: numeric IDs, short alphanumerics, domains
_TRAILING_ID_RE = re.compile(r'\b(?:[a-z]*\d{4,}[a-z\d]*|\d{4,})\b', re.IGNORECASE)
_DOMAIN_RE = re.compile(r'\b[a-z0-9]+\.(?:com|net|so|to|sx|tv|video|porn|xxx|cc|biz|info)\b', re.IGNORECASE)
_HASHTAG_RE = re.compile(r'#\w+')
_PUNCT_RE = re.compile(r'[^a-z0-9\s]+')
_WS_RE = re.compile(r'\s+')
# Dates + times — sceny "Stream Started At MM/DD/YYYY HH:MM Pm" są UNIKALNE per date,
# po stripping mają identyczny normalized title → mass false-positive cluster.
_DATE_RE = re.compile(r'\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b')
_TIME_RE = re.compile(r'\b\d{1,2}:\d{2}\s*[ap]m?\b', re.IGNORECASE)
# Phrases które zdradzają per-instance recording (live stream session, schedule entry).
_LIVE_RE = re.compile(r'\bstream\s+started\s+at\b', re.IGNORECASE)


def normalize_title(title: str, performer_name: str | None = None) -> str:
    """Aggressive normalization dla dup-detection.

    Zwraca pusty string (skip in compare) dla per-instance recordings które
    by się ścinały do generic phrase (np. "Stream Started At ..." — wszystkie
    sesje live mają identyczny normalized title bo daty są stripped).
    """
    if not title:
        return ""
    # Live recordings — skip całkowicie, każdy entry unique per date.
    if _LIVE_RE.search(title):
        return ""
    t = title.lower()
    # Strip dates/times PRZED resztą (mogą zawierać digits które _TRAILING_ID drop).
    t = _DATE_RE.sub(" ", t)
    t = _TIME_RE.sub(" ", t)
    # Strip hashtags + domains + numeric IDs (xhWaKl9, 12117054, vidara.so).
    t = _HASHTAG_RE.sub(" ", t)
    t = _DOMAIN_RE.sub(" ", t)
    t = _TRAILING_ID_RE.sub(" ", t)
    # Strip performer name if known
    if performer_name:
        for part in performer_name.lower().split():
            t = re.sub(rf'\b{re.escape(part)}\b', " ", t)
    # Punctuation → space, collapse whitespace
    t = _PUNCT_RE.sub(" ", t)
    t = _WS_RE.sub(" ", t).strip()
    # Drop noise words (per token)
    tokens = [tok for tok in t.split() if tok not in _NOISE_WORDS and tok not in _SITE_BRANDS and len(tok) > 1]
    return " ".join(tokens)


def _durations_compatible(dur_a, dur_b, max_rel_diff: float = 0.30) -> bool:
    """Return False only when both durations are known and differ too much.

    A missing (None/0) duration on either side is tolerated, because tube
    sites often do not return one. When both are known, a relative difference
    above *max_rel_diff* (measured against the longer one) is treated as
    "different cuts".
    """
    if not (dur_a and dur_b):
        return True
    longer, shorter = max(dur_a, dur_b), min(dur_a, dur_b)
    return not (shorter > 0 and (longer - shorter) / longer > max_rel_diff)


def _score_pairs(scenes: list, threshold: int) -> tuple[dict, list]:
    """Score every duration-compatible scene pair.

    *scenes* holds ``(scene_id, title, duration, normalized_title)`` tuples.
    Returns ``(histogram, pairs)``: the histogram maps a score bucket
    (0, 10, ..., 100) to the number of compared pairs in it, and *pairs* lists
    ``(score, scene_id_a, scene_id_b)`` for pairs scoring >= *threshold*.
    """
    from itertools import combinations

    histogram: dict = defaultdict(int)
    pairs: list = []
    for (sid_a, _, dur_a, norm_a), (sid_b, _, dur_b, norm_b) in combinations(scenes, 2):
        if not _durations_compatible(dur_a, dur_b):
            continue
        # Both metrics must be high: the pair's score is the lower of
        # token_set_ratio (tolerant of word order and extra tokens) and
        # partial_ratio (substring match).
        # NOTE(review): the original intent was to reject the case where one
        # title is a subset of the other ("Private Massage" vs "Private
        # Massage Threesome"), but partial_ratio presumably also scores a
        # substring match highly, so this gate may not filter it — confirm
        # against the rapidfuzz docs before relying on it.
        token_set = fuzz.token_set_ratio(norm_a, norm_b)
        partial = fuzz.partial_ratio(norm_a, norm_b)
        score = min(token_set, partial)
        histogram[int(score // 10) * 10] += 1
        if score >= threshold:
            pairs.append((score, sid_a, sid_b))
    return histogram, pairs


def _duplicate_clusters(scenes: list, pairs: list) -> list:
    """Group scenes connected by matching pairs (union-find); keep groups of 2+."""
    parent: dict = {scene[0]: scene[0] for scene in scenes}

    def find(node):
        # Path halving keeps later lookups short.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    for _score, sid_a, sid_b in pairs:
        root_a, root_b = find(sid_a), find(sid_b)
        if root_a != root_b:
            parent[root_a] = root_b

    clusters: dict = defaultdict(list)
    for scene in scenes:
        clusters[find(scene[0])].append(scene)
    return [members for members in clusters.values() if len(members) > 1]


def main() -> None:
    """Benchmark title-based duplicate detection for one performer.

    Loads the performer's scenes, normalizes their titles, scores every
    duration-compatible pair, and prints the score distribution. Pairs at or
    above ``--threshold`` are then clustered, and the largest clusters are
    printed together with the origins of their playback sources whose
    ``dead_at`` is NULL.
    """
    import uuid

    ap = argparse.ArgumentParser()
    # Parsing the UUID in argparse turns a malformed id into a clean usage
    # error instead of an uncaught ValueError traceback.
    ap.add_argument("performer_id", type=uuid.UUID)
    ap.add_argument("--threshold", type=int, default=80, help="rapidfuzz token_set_ratio threshold (0-100)")
    args = ap.parse_args()
    perf_id = args.performer_id

    with SessionLocal() as session:
        perf = session.get(Performer, perf_id)
        if perf is None:
            print(f"performer {perf_id} not found")
            return
        print(f"=== {perf.canonical_name} ({perf_id}) ===\n")

        rows = session.execute(
            select(Scene.id, Scene.title, Scene.duration_sec)
            .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
            .where(ScenePerformer.performer_id == perf_id)
        ).all()
        if not rows:
            print("no scenes")
            return

        # (scene_id, title, duration, normalized_title); titles that normalize
        # to an empty string are skipped.
        scenes = [(r.id, r.title, r.duration_sec, normalize_title(r.title, perf.canonical_name)) for r in rows]
        scenes = [s for s in scenes if s[3]]
        print(f"Total scenes with normalizable title: {len(scenes)}")
        print()

        # Show normalization sample
        print("Normalization examples (first 5):")
        for _sid, title, _dur, norm in scenes[:5]:
            print(f"  '{title[:55]}' → '{norm[:55]}'")
        print()

        # Only compare titles with at least 3 meaningful tokens: short titles
        # such as "fetish boots" or "private gold" produce false positives
        # against almost anything.
        MIN_TOKENS = 3
        scenes_for_compare = [s for s in scenes if len(s[3].split()) >= MIN_TOKENS]
        print(f"After min-tokens filter ({MIN_TOKENS}+): {len(scenes_for_compare)} scenes ({len(scenes)-len(scenes_for_compare)} skipped)")
        print()

        histogram, pairs = _score_pairs(scenes_for_compare, args.threshold)

        # The distribution is printed before the "no pairs" exit: it is what
        # one needs to pick a lower threshold when nothing matched. Buckets
        # hold min(token_set, partial) scores of the duration-compatible pairs.
        print("Token-set ratio distribution (all pairs):")
        for bucket in sorted(histogram, reverse=True):
            if bucket < 50:
                break
            bar = "#" * min(50, histogram[bucket])
            print(f"  {bucket:3d}-{bucket+9:3d} : {histogram[bucket]:4d}  {bar}")
        print()

        if not pairs:
            print(f"No pairs ≥ {args.threshold} threshold.")
            return

        print(f"Likely-duplicate pairs (token_set ≥ {args.threshold}): {len(pairs)}")
        print()

        dup_clusters = _duplicate_clusters(scenes, pairs)

        print(f"Duplicate clusters: {len(dup_clusters)}")
        total_redundant = sum(len(c) - 1 for c in dup_clusters)
        print(f"Total redundant scenes: {total_redundant} (merge to {len(dup_clusters)} canonical)")
        print()

        n_show = min(10, len(dup_clusters))
        top_clusters = sorted(dup_clusters, key=len, reverse=True)[:n_show]

        # Playback-source origins for context — fetched only for the scenes
        # that are actually printed below.
        shown_ids = [s[0] for cluster in top_clusters for s in cluster]
        ps_by_scene: dict = defaultdict(set)
        for row in session.execute(
            select(PlaybackSource.scene_id, PlaybackSource.origin)
            .where(PlaybackSource.scene_id.in_(shown_ids))
            .where(PlaybackSource.dead_at.is_(None))
        ).all():
            # Keep only the part after the first ":" when the origin has one.
            origin = row.origin.split(":", 1)[1] if ":" in row.origin else row.origin
            ps_by_scene[row.scene_id].add(origin)

        print(f"=== Top {n_show} clusters ===\n")
        for i, cluster in enumerate(top_clusters, 1):
            print(f"--- Cluster {i} ({len(cluster)} scenes) ---")
            for sid, title, dur, _norm in cluster:
                dur_s = f"{dur}s" if dur else "no-dur"
                sources = ",".join(sorted(ps_by_scene.get(sid, []))) or "no-src"
                print(f"  {sid}  {dur_s:>7s}  [{sources[:35]:35s}]  {title[:55]}")
            print()


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()