diff --git a/scripts/merge_dupe_thumb_asset.py b/scripts/merge_dupe_thumb_asset.py new file mode 100644 index 0000000..0cb8a81 --- /dev/null +++ b/scripts/merge_dupe_thumb_asset.py @@ -0,0 +1,115 @@ +"""Merge tube-dup po asset-id miniatury (rodzina hdporn.gg / fullmovies.xxx). + +Te siostrzane platformy dzielą jedną przestrzeń id wideo i ingestują ten sam film pod +RÓŻNYMI tytułami → bulk_dedup tego nie łapie (różne tytuły, brak phash, exact-title-merge +nie działa). Sygnał: identyczny asset-id w ścieżce miniatury `/000//` na +`img.hdporn.gg` LUB `img.fullmovies.xxx` + IDENTYCZNA długość (co do sekundy) = ten sam +film (zweryfikowane 2026-06-14, próbka = realne dupy; report 205b17d9 / 5a2944cb). + +KRYTYCZNE: wspólny id-space istnieje TYLKO dla tej pary hostów. Inne CDN-y z tym samym +wzorcem ścieżki (ptx.cdntrex, porn00, freshporno, pornhat...) reużywają te numery dla +NIEpowiązanych filmów — grupowanie cross-host po gołym numerze daje fałszywe pary +(dry-run 2026-06-14: "UsePOV Gia Paige" vs "chelsie rae bikini squad" pod tym samym +numerem). Stąd twarde `~ img.(hdporn.gg|fullmovies.xxx)` + guard długości (GROUP BY +asset_id, dur → różna długość przy tym samym numerze NIE jest łączona). + +Keep = scena z największą liczbą external_refs → playback_sources → najstarsza. +Merge przez resolve.scene_merge.merge_scenes (przenosi refs/performers/tags/fingerprints/ +playback_sources, kasuje drop). + +Użycie (kontener worker): + python scripts/merge_dupe_thumb_asset.py [STUDIO_ID] [--commit] +Bez STUDIO_ID = global. Bez --commit = dry-run. +""" +from __future__ import annotations + +import sys +import uuid as _u + +from sqlalchemy import text + +from app.db import session_scope +from app.resolve.scene_merge import merge_scenes + +_HOST_RE = r"://img\.(hdporn\.gg|fullmovies\.xxx)/[0-9]+000/[0-9]+/" + + +def _args() -> tuple[str | None, bool]: + commit = "--commit" in sys.argv + studio = None + for a in sys.argv[1:]: + if not a.startswith("--") and len(a) >= 32: + studio = a + return studio, commit + + +def _groups(studio_id: str | None) -> list[list[str]]: + where_studio = "AND s.studio_id = :sid" if studio_id else "" + sql = f""" + WITH cand AS ( + SELECT DISTINCT s.id, + substring(p.thumbnail_url from '/[0-9]+000/([0-9]+)/') AS asset_id, + s.duration_sec dur, + s.created_at, + (SELECT count(*) FROM scene_external_refs r WHERE r.scene_id=s.id) refs, + (SELECT count(*) FROM playback_sources pp WHERE pp.scene_id=s.id) srcs + FROM scenes s + JOIN playback_sources p ON p.scene_id=s.id + WHERE p.thumbnail_url ~ '{_HOST_RE}' + AND p.dead_at IS NULL + AND s.duration_sec IS NOT NULL + {where_studio} + ) + SELECT array_agg(id::text ORDER BY refs DESC, srcs DESC, created_at ASC) members + FROM cand + WHERE asset_id IS NOT NULL + GROUP BY asset_id, dur + HAVING count(DISTINCT id) > 1 + """ + params = {"sid": studio_id} if studio_id else {} + with session_scope() as s: + rows = s.execute(text(sql), params).all() + seen: set[frozenset] = set() + out: list[list[str]] = [] + for (members,) in rows: + key = frozenset(members) + if key in seen: + continue + seen.add(key) + out.append(list(members)) + return out + + +def main() -> None: + studio_id, commit = _args() + groups = _groups(studio_id) + pairs = sum(len(g) - 1 for g in groups) + print( + f"studio={studio_id or 'ALL'} groups={len(groups)} merges={pairs} commit={commit}", + flush=True, + ) + merged = errors = 0 + for g in groups: + keep = g[0] + for drop in g[1:]: + if not commit: + print(f" [dry] keep {keep[:8]} <- drop {drop[:8]}") + continue + try: + with session_scope() as s: + merge_scenes( + s, keep_id=_u.UUID(keep), drop_id=_u.UUID(drop), + resolved_by="merge_dupe_thumb_asset", + ) + merged += 1 + if merged % 500 == 0: + print(f" progress merged={merged}/{pairs} errors={errors}", flush=True) + except Exception as e: + errors += 1 + if errors <= 20: + print(f" ERR keep {keep[:8]} drop {drop[:8]}: {str(e)[:120]}") + print(f"DONE merged={merged}/{pairs} errors={errors}", flush=True) + + +if __name__ == "__main__": + main()