goon/scripts/restore_canonical_titles.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

145 lines
5.4 KiB
Python

"""One-shot: for scenes that have a canonical (tpdb/stashdb) external_ref, restore
the title from external_records.raw in place of the tube SEO junk that polluted it.

The bug that caused this: before 2026-05-03 17:50 `_update_scene_fields` applied
"longer wins" to titles regardless of source_kind, so a tube title would override
the canonical one. The fix has been deployed to production, but 175 old records
need manual repair.

Strategy:
- Prefer the TPDB title if it exists (the most studio-canonical format)
- Otherwise take the stashdb title
- Skip if the canonical title is ONLY a list of performers (then leave the current title as is)
"""
from __future__ import annotations
import logging
from sqlalchemy import select
from app.db import session_scope
from app.models.external_record import EntityKind, ExternalRecord
from app.models.performer import Performer
from app.models.scene import Scene, SceneExternalRef, ScenePerformer
from app.models.source import Source, SourceKind
from app.normalize.text import normalize, slugify
# Script-wide logger; the name is a fixed string rather than derived from __name__.
log = logging.getLogger("restore_canonical_titles")
# Configure root logging at import time: INFO threshold, timestamp + level prefix.
# At this threshold debug-level messages are not emitted.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
def _get_canonical_title(session, scene_id) -> tuple[str, str] | None:
    """Look up the canonical title recorded for a scene.

    Reads the raw external records that are linked to ``scene_id`` through
    its external refs, limited to sources whose kind is tpdb or stashdb and
    to records whose entity kind is ``scene``, and takes the ``"title"`` key
    of each raw payload.

    Args:
        session: Database session used to execute the query.
        scene_id: Id of the scene whose canonical title is wanted.

    Returns:
        ``(title, source_name)`` where the title is stripped of surrounding
        whitespace. A row whose source name is ``"tpdb"`` wins over one named
        ``"stashdb"``. ``None`` when no matching row has a non-blank title.
    """
    # An external record belongs to a ref when source, external id and
    # entity kind all line up.
    record_matches_ref = (
        (ExternalRecord.source_id == SceneExternalRef.source_id)
        & (ExternalRecord.external_id == SceneExternalRef.external_id)
        & (ExternalRecord.entity_kind == EntityKind.scene)
    )
    stmt = (
        select(Source.name, ExternalRecord.raw)
        .join(SceneExternalRef, (SceneExternalRef.source_id == Source.id))
        .join(ExternalRecord, record_matches_ref)
        .where(
            SceneExternalRef.scene_id == scene_id,
            Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]),
        )
    )

    # Keep one non-blank title per source name; a later row for the same
    # source replaces an earlier one.
    titles_by_source: dict[str, str] = {}
    for source_name, payload in session.execute(stmt).all():
        candidate = ((payload or {}).get("title") or "").strip()
        if candidate:
            titles_by_source[source_name] = candidate

    # Prefer tpdb over stashdb.
    # NOTE(review): rows are filtered by Source.kind but picked by Source.name;
    # this assumes the sources are named exactly "tpdb" / "stashdb" - confirm.
    for preferred in ("tpdb", "stashdb"):
        if preferred in titles_by_source:
            return titles_by_source[preferred], preferred
    return None
def _is_just_performer_names(title: str, performer_names: list[str]) -> bool:
"""Heurystyka: tytuł to tylko lista nazw performerów (StashDB default for missing title)."""
t = title.lower().strip()
# Prosty check: czy każda nazwa performera występuje, a nie ma innych słów
for n in performer_names:
t = t.replace(n.lower(), "").replace(",", "").replace("&", "").strip()
return len(t) <= 3 # zostały tylko spacje / "and" / kreski
def main() -> None:
    """Restore canonical titles on scenes that have both canonical and scraper refs.

    First collects the ids of every scene that has at least one external ref
    from a tpdb/stashdb source and at least one from a scraper source. Each
    scene is then handled in its own ``session_scope()``:

    - a scene that can no longer be loaded is passed over without counting;
    - no canonical title available -> counted as skipped;
    - canonical title already equals the scene title -> counted as
      not_polluted;
    - canonical title is only the scene's performer names -> counted as
      skipped;
    - otherwise ``title`` and ``title_normalized`` are overwritten, ``slug``
      is filled in only when it is empty, and the scene counts as fixed.

    A summary line with all counters is logged at the end.
    """
    tally = {"fixed": 0, "skipped": 0, "not_polluted": 0}

    # Scenes referenced by a canonical (tpdb/stashdb) source.
    canonical_ref_scenes = (
        select(SceneExternalRef.scene_id)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
    )
    # Scenes referenced by a scraper source.
    scraper_ref_scenes = (
        select(SceneExternalRef.scene_id)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(Source.kind == SourceKind.scraper)
    )
    candidates_stmt = select(Scene.id).where(
        Scene.id.in_(canonical_ref_scenes),
        Scene.id.in_(scraper_ref_scenes),
    )

    # Collect the ids up front: rows cannot be streamed across session
    # scopes, and every scene below gets a scope of its own.
    with session_scope() as session:
        scene_ids = [found_id for (found_id,) in session.execute(candidates_stmt)]
    log.info("Found %d scenes with both canonical and scraper refs", len(scene_ids))

    for scene_id in scene_ids:
        with session_scope() as session:
            scene = session.get(Scene, scene_id)
            if scene is None:
                continue

            canonical = _get_canonical_title(session, scene_id)
            if canonical is None:
                tally["skipped"] += 1
                continue

            new_title, source_name = canonical
            if new_title == scene.title:
                tally["not_polluted"] += 1
                continue

            # Skip when the canonical title is just the performer names.
            performer_stmt = (
                select(Performer.canonical_name)
                .join(ScenePerformer, ScenePerformer.performer_id == Performer.id)
                .where(ScenePerformer.scene_id == scene_id)
            )
            performer_names = [name for (name,) in session.execute(performer_stmt)]
            if performer_names and _is_just_performer_names(new_title, performer_names):
                log.debug("skip %s: canon is just performer names: %s", scene_id, new_title)
                tally["skipped"] += 1
                continue

            old_title = scene.title or ""
            log.info(
                "fix %s: '%s' (%d) → '%s' (%d, %s)",
                scene_id,
                old_title[:40],
                len(old_title),
                new_title[:40],
                len(new_title),
                source_name,
            )
            scene.title = new_title
            scene.title_normalized = normalize(new_title)
            # An existing slug is left alone; only a missing one is derived.
            if not scene.slug:
                scene.slug = slugify(new_title)
            tally["fixed"] += 1

    log.info(
        "done: fixed=%d skipped=%d not_polluted=%d total=%d",
        tally["fixed"],
        tally["skipped"],
        tally["not_polluted"],
        len(scene_ids),
    )
# Run the repair only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()