Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
145 lines
5.4 KiB
Python
"""One-shot: for scenes that have a canonical (tpdb/stashdb) external_ref, restore
the title from external_records.raw in place of the tube SEO junk that previously
polluted it.

The bug that caused this: before 2026-05-03 17:50 `_update_scene_fields` applied
"longer wins" to titles regardless of source_kind, so a tube title would override
the canonical one. The fix has been deployed to production, but 175 old records
still need a manual repair.

Strategy:
- Prefer the TPDB title if it exists (the most studio-canonical format)
- Otherwise take the stashdb title
- Skip if the canonical title is ONLY a list of performers (then leave the title as is)
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from sqlalchemy import select
|
|
|
|
from app.db import session_scope
|
|
from app.models.external_record import EntityKind, ExternalRecord
|
|
from app.models.performer import Performer
|
|
from app.models.scene import Scene, SceneExternalRef, ScenePerformer
|
|
from app.models.source import Source, SourceKind
|
|
from app.normalize.text import normalize, slugify
|
|
|
|
# Explicitly named logger: when run as a script, __name__ would be "__main__".
log = logging.getLogger("restore_canonical_titles")
# Configure root logging at import time so every message carries a timestamp and level.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
def _get_canonical_title(session, scene_id) -> tuple[str, str] | None:
    """Return ``(title, source_name)`` for the scene, or None when no canonical title exists.

    Looks up the scene's external refs whose source kind is tpdb or stashdb, joins
    them to the matching scene-level external records, and reads the ``"title"``
    key of each record's ``raw`` payload. Blank or missing titles are ignored.
    A title stored under the source name ``"tpdb"`` wins over one stored under
    ``"stashdb"``.

    NOTE(review): rows are filtered by ``Source.kind`` but the preference lookup
    is keyed by ``Source.name``; this assumes the sources are literally named
    "tpdb" and "stashdb" -- confirm against the sources table.
    """
    # An external record belongs to a ref when source, external id and entity kind line up.
    record_matches_ref = (
        (ExternalRecord.source_id == SceneExternalRef.source_id)
        & (ExternalRecord.external_id == SceneExternalRef.external_id)
        & (ExternalRecord.entity_kind == EntityKind.scene)
    )
    stmt = (
        select(Source.name, ExternalRecord.raw)
        .join(SceneExternalRef, (SceneExternalRef.source_id == Source.id))
        .join(ExternalRecord, record_matches_ref)
        .where(
            SceneExternalRef.scene_id == scene_id,
            Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]),
        )
    )

    # Collect the non-blank title per source name; a later row for the same name overwrites.
    titles: dict[str, str] = {}
    for source_name, payload in session.execute(stmt).all():
        cleaned = ((payload or {}).get("title") or "").strip()
        if cleaned:
            titles[source_name] = cleaned

    # Preference order: tpdb first, then stashdb.
    for preferred in ("tpdb", "stashdb"):
        if preferred in titles:
            return titles[preferred], preferred
    return None
|
|
|
|
|
|
def _is_just_performer_names(title: str, performer_names: list[str]) -> bool:
|
|
"""Heurystyka: tytuł to tylko lista nazw performerów (StashDB default for missing title)."""
|
|
t = title.lower().strip()
|
|
# Prosty check: czy każda nazwa performera występuje, a nie ma innych słów
|
|
for n in performer_names:
|
|
t = t.replace(n.lower(), "").replace(",", "").replace("&", "").strip()
|
|
return len(t) <= 3 # zostały tylko spacje / "and" / kreski
|
|
|
|
|
|
def main() -> None:
    """Restore canonical titles on scenes that have both canonical and scraper refs.

    First collects the ids of all scenes referenced by at least one tpdb/stashdb
    source and at least one scraper source. Each scene is then handled in its own
    ``session_scope()`` block: the canonical title is looked up, scenes whose title
    already matches are counted as not polluted, scenes whose canonical title is
    only a list of performer names are skipped, and the rest get ``title`` and
    ``title_normalized`` overwritten (``slug`` is filled only when empty).
    A summary line with the counters is logged at the end.

    NOTE(review): assumes ``session_scope()`` commits on a clean exit -- confirm.
    """
    restored = 0
    left_alone = 0
    already_canonical = 0

    # Scene ids referenced by a canonical (tpdb/stashdb) source ...
    canonical_ref_scenes = (
        select(SceneExternalRef.scene_id)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
    )
    # ... and scene ids referenced by a scraper source.
    scraper_ref_scenes = (
        select(SceneExternalRef.scene_id)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(Source.kind == SourceKind.scraper)
    )
    candidates_stmt = select(Scene.id).where(
        Scene.id.in_(canonical_ref_scenes),
        Scene.id.in_(scraper_ref_scenes),
    )

    # Collect plain ids up front so that each scene can get its own session below.
    with session_scope() as session:
        scene_ids = [row[0] for row in session.execute(candidates_stmt)]
    log.info("Found %d scenes with both canonical and scraper refs", len(scene_ids))

    for scene_id in scene_ids:
        with session_scope() as session:
            scene = session.get(Scene, scene_id)
            if scene is None:
                continue

            candidate = _get_canonical_title(session, scene_id)
            if candidate is None:
                left_alone += 1
                continue

            canon_title, origin = candidate
            if canon_title == scene.title:
                already_canonical += 1
                continue

            # Skip when the canonical title is merely the performers' names.
            names_stmt = (
                select(Performer.canonical_name)
                .join(ScenePerformer, ScenePerformer.performer_id == Performer.id)
                .where(ScenePerformer.scene_id == scene_id)
            )
            performer_names = [row[0] for row in session.execute(names_stmt)]
            if performer_names and _is_just_performer_names(canon_title, performer_names):
                log.debug("skip %s: canon is just performer names: %s", scene_id, canon_title)
                left_alone += 1
                continue

            previous_title = scene.title or ""
            log.info(
                "fix %s: '%s' (%d) → '%s' (%d, %s)",
                scene_id,
                previous_title[:40],
                len(previous_title),
                canon_title[:40],
                len(canon_title),
                origin,
            )
            scene.title = canon_title
            scene.title_normalized = normalize(canon_title)
            # An existing slug is kept; only a missing one is derived from the new title.
            if not scene.slug:
                scene.slug = slugify(canon_title)
            restored += 1

    log.info(
        "done: fixed=%d skipped=%d not_polluted=%d total=%d",
        restored,
        left_alone,
        already_canonical,
        len(scene_ids),
    )
|
|
|
|
|
|
# Run the repair only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|