goon/app/normalize/scenes.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

167 lines
4.9 KiB
Python

"""Mapowanie RawScene/RawPerformer/RawStudio/RawTag → znormalizowane DTO gotowe do upsertu.
Normalizacja = wyliczenie pól indeksujących (`*_normalized`, `slug`). Surowe pola jak
`title`, `release_date` przekazujemy bez zmian — kanon wybiera resolver.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date
from app.connectors.base import (
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.normalize.text import normalize, normalize_person, slugify
@dataclass
class NormalizedTag:
name: str
slug: str
external_id: str | None
@dataclass
class NormalizedStudio:
name: str
name_normalized: str
slug: str
external_id: str | None
parent_external_id: str | None
parent_name: str | None
network: str | None
homepage_url: str | None
@dataclass
class NormalizedPerformer:
canonical_name: str
name_normalized: str
slug: str
external_id: str | None
aliases: list[str]
aliases_normalized: list[str]
gender: str | None
birth_date: date | None
country: str | None
as_alias_in_scene: str | None
@dataclass
class NormalizedScene:
    """Scene DTO produced by ``normalize_scene`` and ready to be upserted.

    The first ten fields are required; the nested collections default to
    empty containers (a fresh one per instance) and ``studio`` to ``None``.
    """

    # Identifier of the scene in the originating source.
    external_id: str
    # Title exactly as the source supplied it.
    title: str
    # ``title`` run through the text normalizer (indexing field).
    title_normalized: str
    # Slug derived from ``title``.
    slug: str
    # The following scalar fields are copied from the raw scene unchanged.
    release_date: date | None
    description: str | None
    duration_sec: int | None
    code: str | None
    director: str | None
    url: str | None

    studio: NormalizedStudio | None = None
    performers: list[NormalizedPerformer] = field(default_factory=list)
    tags: list[NormalizedTag] = field(default_factory=list)

    # List of (kind, value) pairs, e.g. [('phash', 'abc...'), ('oshash', '...')].
    fingerprints: list[tuple[str, str]] = field(default_factory=list)

    # Playback links (passthrough - the resolver adds them to the
    # playback_sources table).
    playback_sources: list[RawPlaybackSource] = field(default_factory=list)

    # Mapping source_name -> external_id declared by the source this scene
    # came from (e.g. StashDB exposes tpdb_id in `urls`).
    cross_source_refs: dict[str, str] = field(default_factory=dict)
def normalize_tag(raw: RawTag) -> NormalizedTag:
    """Convert a raw tag into its normalized DTO.

    The name and external id are copied as-is. The slug supplied by the
    source is kept when it is truthy; otherwise one is derived from the name.
    """
    tag_slug = raw.slug
    if not tag_slug:
        tag_slug = slugify(raw.name)
    return NormalizedTag(name=raw.name, slug=tag_slug, external_id=raw.external_id)
def normalize_studio(raw: RawStudio) -> NormalizedStudio:
    """Convert a raw studio into its normalized DTO.

    Computes the indexing fields (``name_normalized`` and ``slug``) and copies
    every other attribute through unchanged. A truthy source slug wins over
    the one derived from the studio name.
    """
    studio_name = raw.name
    studio_slug = raw.slug
    if not studio_slug:
        studio_slug = slugify(studio_name)
    return NormalizedStudio(
        name=studio_name,
        name_normalized=normalize(studio_name),
        slug=studio_slug,
        external_id=raw.external_id,
        parent_external_id=raw.parent_external_id,
        parent_name=raw.parent_name,
        network=raw.network,
        homepage_url=raw.homepage_url,
    )
_GENDER_ALIASES = {
"female": "female",
"f": "female",
"male": "male",
"m": "male",
"transgender female": "transgender_female",
"trans female": "transgender_female",
"trans-female": "transgender_female",
"transgender_female": "transgender_female",
"transgender male": "transgender_male",
"trans male": "transgender_male",
"trans-male": "transgender_male",
"transgender_male": "transgender_male",
"non binary": "non_binary",
"non-binary": "non_binary",
"nonbinary": "non_binary",
"non_binary": "non_binary",
"intersex": "intersex",
"unknown": "unknown",
}
def _normalize_gender(value: str | None) -> str | None:
"""TPDB/StashDB zwracają warianty z spacjami/myślnikami. Normalizujemy do enum
`performer_gender` w bazie. Wartość niezmapowana → None (NULL w DB)."""
if not value:
return None
return _GENDER_ALIASES.get(value.strip().lower())
def normalize_performer(raw: RawPerformer) -> NormalizedPerformer:
    """Convert a raw performer into its normalized DTO.

    Aliases are de-duplicated while preserving their first-seen order, and a
    person-normalized counterpart is produced for each surviving alias. The
    gender string is mapped to its canonical value (``None`` when unmapped);
    the remaining attributes are copied through unchanged.
    """
    # Order-preserving de-duplication of the source aliases.
    seen = set()
    unique_aliases = []
    for alias in raw.aliases:
        if alias not in seen:
            seen.add(alias)
            unique_aliases.append(alias)

    performer_name = raw.name
    normalized_aliases = list(map(normalize_person, unique_aliases))

    return NormalizedPerformer(
        canonical_name=performer_name,
        name_normalized=normalize_person(performer_name),
        slug=slugify(performer_name),
        external_id=raw.external_id,
        aliases=unique_aliases,
        aliases_normalized=normalized_aliases,
        gender=_normalize_gender(raw.gender),
        birth_date=raw.birth_date,
        country=raw.country,
        as_alias_in_scene=raw.as_alias_in_scene,
    )
def normalize_scene(raw: RawScene) -> NormalizedScene:
    """Convert a raw scene, including its nested objects, into a normalized DTO.

    Computes ``title_normalized`` and ``slug`` from the title, normalizes the
    studio (when one is present), every performer and every tag, flattens
    fingerprints into ``(kind, value)`` tuples, and takes shallow copies of
    the playback sources and cross-source references. All other scalar fields
    are passed through unchanged.
    """
    scene_title = raw.title

    # A falsy studio (e.g. None) is normalized to None.
    studio = None
    if raw.studio:
        studio = normalize_studio(raw.studio)

    performers = list(map(normalize_performer, raw.performers))
    tags = list(map(normalize_tag, raw.tags))
    fingerprints = [(fingerprint.kind, fingerprint.value) for fingerprint in raw.fingerprints]

    return NormalizedScene(
        external_id=raw.external_id,
        title=scene_title,
        title_normalized=normalize(scene_title),
        slug=slugify(scene_title),
        release_date=raw.release_date,
        description=raw.description,
        duration_sec=raw.duration_sec,
        code=raw.code,
        director=raw.director,
        url=raw.url,
        studio=studio,
        performers=performers,
        tags=tags,
        fingerprints=fingerprints,
        # Copies, so the DTO does not share containers with the raw object.
        playback_sources=[*raw.playback_sources],
        cross_source_refs=dict(raw.cross_source_refs),
    )