"""One-shot: dla scen mających canonical (tpdb/stashdb) external_ref, przywróć title z external_records.raw zamiast tube SEO crap'u który wcześniej polluował. Bug który to spowodował: `_update_scene_fields` przed 2026-05-03 17:50 stosował "longer wins" dla tytułów niezależnie od source_kind, więc tube title przewyższał canonical. Fix poszedł na produkcję, ale 175 starych rekordów wymaga manualnej naprawy. Strategy: - Preferuj TPDB title jeśli istnieje (najbardziej studio-canonical format) - Inaczej weź stashdb title - Skip jeśli canonical title jest TYLKO listą performerów (wtedy zostaw co jest) """ from __future__ import annotations import logging from sqlalchemy import select from app.db import session_scope from app.models.external_record import EntityKind, ExternalRecord from app.models.performer import Performer from app.models.scene import Scene, SceneExternalRef, ScenePerformer from app.models.source import Source, SourceKind from app.normalize.text import normalize, slugify log = logging.getLogger("restore_canonical_titles") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") def _get_canonical_title(session, scene_id) -> tuple[str, str] | None: """Returns (title, source_name) lub None gdy brak canonical refs.""" rows = session.execute( select(Source.name, ExternalRecord.raw) .join(SceneExternalRef, (SceneExternalRef.source_id == Source.id)) .join( ExternalRecord, (ExternalRecord.source_id == SceneExternalRef.source_id) & (ExternalRecord.external_id == SceneExternalRef.external_id) & (ExternalRecord.entity_kind == EntityKind.scene), ) .where( SceneExternalRef.scene_id == scene_id, Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]), ) ).all() if not rows: return None # Preferuj tpdb > stashdb by_kind: dict[str, str] = {} for src_name, raw in rows: title = (raw or {}).get("title") or "" if title.strip(): by_kind[src_name] = title.strip() if "tpdb" in by_kind: return by_kind["tpdb"], "tpdb" if "stashdb" in by_kind: return by_kind["stashdb"], "stashdb" return None def _is_just_performer_names(title: str, performer_names: list[str]) -> bool: """Heurystyka: tytuł to tylko lista nazw performerów (StashDB default for missing title).""" t = title.lower().strip() # Prosty check: czy każda nazwa performera występuje, a nie ma innych słów for n in performer_names: t = t.replace(n.lower(), "").replace(",", "").replace("&", "").strip() return len(t) <= 3 # zostały tylko spacje / "and" / kreski def main() -> None: fixed = 0 skipped = 0 not_polluted = 0 with session_scope() as session: # Find polluted scene IDs first (can't yield over session scope, so collect ids) scene_ids = [ r[0] for r in session.execute( select(Scene.id).where( Scene.id.in_( select(SceneExternalRef.scene_id) .join(Source, Source.id == SceneExternalRef.source_id) .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb])) ), Scene.id.in_( select(SceneExternalRef.scene_id) .join(Source, Source.id == SceneExternalRef.source_id) .where(Source.kind == SourceKind.scraper) ), ) ) ] log.info("Found %d scenes with both canonical and scraper refs", len(scene_ids)) for scene_id in scene_ids: with session_scope() as session: scene = session.get(Scene, scene_id) if scene is None: continue cand = _get_canonical_title(session, scene_id) if cand is None: skipped += 1 continue canon_title, src = cand if canon_title == scene.title: not_polluted += 1 continue # Skip jeśli canonical title to tylko nazwy performerów perf_names = [ r[0] for r in session.execute( select(Performer.canonical_name) .join(ScenePerformer, ScenePerformer.performer_id == Performer.id) .where(ScenePerformer.scene_id == scene_id) ) ] if perf_names and _is_just_performer_names(canon_title, perf_names): log.debug("skip %s: canon is just performer names: %s", scene_id, canon_title) skipped += 1 continue log.info( "fix %s: '%s' (%d) → '%s' (%d, %s)", scene_id, (scene.title or "")[:40], len(scene.title or ""), canon_title[:40], len(canon_title), src, ) scene.title = canon_title scene.title_normalized = normalize(canon_title) if not scene.slug: scene.slug = slugify(canon_title) fixed += 1 log.info( "done: fixed=%d skipped=%d not_polluted=%d total=%d", fixed, skipped, not_polluted, len(scene_ids), ) if __name__ == "__main__": main()