Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
167 lines
4.9 KiB
Python
167 lines
4.9 KiB
Python
"""Mapowanie RawScene/RawPerformer/RawStudio/RawTag → znormalizowane DTO gotowe do upsertu.
|
|
|
|
Normalizacja = wyliczenie pól indeksujących (`*_normalized`, `slug`). Surowe pola jak
|
|
`title`, `release_date` przekazujemy bez zmian — kanon wybiera resolver.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import date
|
|
|
|
from app.connectors.base import (
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.normalize.text import normalize, normalize_person, slugify
|
|
|
|
|
|
@dataclass
class NormalizedTag:
    """Tag DTO ready for upsert.

    Produced by ``normalize_tag``; carries the tag name together with the
    slug used as its index field.
    """

    # Tag name exactly as supplied by the source connector.
    name: str
    # Index field: the source-provided slug, or one derived from ``name``.
    slug: str
    # Identifier of the tag on the source side; None when the source has none.
    external_id: str | None
|
|
|
|
|
|
@dataclass
class NormalizedStudio:
    """Studio DTO ready for upsert.

    Produced by ``normalize_studio``. ``name_normalized`` and ``slug`` are the
    computed index fields; every other attribute is copied from the raw
    studio unchanged.
    """

    # Studio name exactly as supplied by the source connector.
    name: str
    # Index field computed from ``name`` by the text normalizer.
    name_normalized: str
    # Index field: the source-provided slug, or one derived from ``name``.
    slug: str
    # Identifier of the studio on the source side, if any.
    external_id: str | None
    # Source-side identifier and display name of the parent studio, if any.
    parent_external_id: str | None
    parent_name: str | None
    # Passed through from the raw studio untouched.
    network: str | None
    homepage_url: str | None
|
|
|
|
|
|
@dataclass
class NormalizedPerformer:
    """Performer DTO ready for upsert.

    Produced by ``normalize_performer``. ``name_normalized``, ``slug`` and
    ``aliases_normalized`` are computed index fields; the remaining attributes
    come from the raw performer (``gender`` after mapping to the DB enum).
    """

    # Performer name exactly as supplied by the source connector.
    canonical_name: str
    # Index fields computed from ``canonical_name``.
    name_normalized: str
    slug: str
    # Identifier of the performer on the source side, if any.
    external_id: str | None
    # Aliases with exact duplicates removed, original order kept.
    aliases: list[str]
    # Person-normalized form of each entry in ``aliases``, same order.
    aliases_normalized: list[str]
    # Value of the ``performer_gender`` enum, or None when unmapped.
    gender: str | None
    birth_date: date | None
    country: str | None
    # NOTE(review): presumably the alias the performer is credited under in
    # the enclosing scene — confirm against the connector that fills it.
    as_alias_in_scene: str | None
|
|
|
|
|
|
@dataclass
class NormalizedScene:
    """Scene DTO ready for upsert.

    Produced by ``normalize_scene``. ``title_normalized`` and ``slug`` are the
    computed index fields; raw fields such as ``title`` and ``release_date``
    are carried over unchanged.
    """

    # Identifier of the scene on the source side (always present).
    external_id: str
    # Title exactly as supplied by the source connector.
    title: str
    # Index fields computed from ``title``.
    title_normalized: str
    slug: str
    # Raw scene fields, passed through untouched.
    release_date: date | None
    description: str | None
    duration_sec: int | None
    code: str | None
    director: str | None
    url: str | None

    # Nested DTOs; each collection defaults to a fresh empty list.
    studio: NormalizedStudio | None = None
    performers: list[NormalizedPerformer] = field(default_factory=list)
    tags: list[NormalizedTag] = field(default_factory=list)

    # List of (kind, value) pairs, e.g. [('phash', 'abc...'), ('oshash', '...')].
    fingerprints: list[tuple[str, str]] = field(default_factory=list)

    # Playback links (passthrough — the resolver adds them to the
    # playback_sources table).
    playback_sources: list[RawPlaybackSource] = field(default_factory=list)

    # Mapping source_name -> external_id declared by the source this scene
    # comes from (e.g. StashDB exposes tpdb_id in `urls`).
    cross_source_refs: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
def normalize_tag(raw: RawTag) -> NormalizedTag:
    """Convert a raw tag into its upsert-ready DTO.

    The slug supplied by the source wins; when it is missing or empty, one is
    derived from the tag name. Name and external id are copied unchanged.
    """
    tag_slug = raw.slug or slugify(raw.name)
    return NormalizedTag(name=raw.name, slug=tag_slug, external_id=raw.external_id)
|
|
|
|
|
|
def normalize_studio(raw: RawStudio) -> NormalizedStudio:
    """Convert a raw studio into its upsert-ready DTO.

    Only the index fields (``name_normalized``, ``slug``) are computed here;
    everything else is copied from ``raw`` unchanged. A slug supplied by the
    source takes precedence over one derived from the name.
    """
    studio_name = raw.name
    index_name = normalize(studio_name)
    studio_slug = raw.slug or slugify(studio_name)
    return NormalizedStudio(
        name=studio_name,
        name_normalized=index_name,
        slug=studio_slug,
        external_id=raw.external_id,
        parent_external_id=raw.parent_external_id,
        parent_name=raw.parent_name,
        network=raw.network,
        homepage_url=raw.homepage_url,
    )
|
|
|
|
|
|
# Lookup table: lower-cased gender spelling reported by a source -> value of
# the `performer_gender` enum. Grouped by canonical value so that adding a new
# spelling is a one-word change; the resulting dict has one entry per alias.
_GENDER_ALIASES: dict[str, str] = {
    alias: canonical
    for canonical, spellings in (
        ("female", ("female", "f")),
        ("male", ("male", "m")),
        (
            "transgender_female",
            ("transgender female", "trans female", "trans-female", "transgender_female"),
        ),
        (
            "transgender_male",
            ("transgender male", "trans male", "trans-male", "transgender_male"),
        ),
        ("non_binary", ("non binary", "non-binary", "nonbinary", "non_binary")),
        ("intersex", ("intersex",)),
        ("unknown", ("unknown",)),
    )
    for alias in spellings
}
|
|
|
|
|
|
def _normalize_gender(value: str | None) -> str | None:
    """Map a source-reported gender string to the `performer_gender` DB enum.

    TPDB/StashDB return variants with spaces, hyphens or underscores. The
    lookup is case-insensitive and ignores surrounding whitespace. If the
    exact spelling is not in ``_GENDER_ALIASES``, it is retried with hyphens,
    underscores and whitespace runs collapsed to single spaces, so that e.g.
    ``"Transgender-Female"`` or ``"non  binary"`` resolve without every
    separator combination being listed in the table.

    Args:
        value: Gender string from the source, or None.

    Returns:
        The enum value, or None when ``value`` is empty or cannot be mapped
        (stored as NULL in the DB).
    """
    if not value:
        return None

    lowered = value.strip().lower()
    exact = _GENDER_ALIASES.get(lowered)
    if exact is not None:
        return exact

    # Separator-insensitive fallback: "trans_female", "transgender-male",
    # "non   binary" all reduce to the space-separated spelling in the table.
    collapsed = " ".join(lowered.replace("-", " ").replace("_", " ").split())
    return _GENDER_ALIASES.get(collapsed)
|
|
|
|
|
|
def normalize_performer(raw: RawPerformer) -> NormalizedPerformer:
    """Convert a raw performer into its upsert-ready DTO.

    Exact-duplicate aliases are dropped while keeping first-seen order, and a
    person-normalized copy of each remaining alias is produced in the same
    order. Gender is mapped onto the DB enum; the other fields are copied
    from ``raw`` unchanged.
    """
    performer_name = raw.name
    # dict keys are unique and keep insertion order, so this de-duplicates
    # without reordering.
    unique_aliases = [*dict.fromkeys(raw.aliases)]
    return NormalizedPerformer(
        canonical_name=performer_name,
        name_normalized=normalize_person(performer_name),
        slug=slugify(performer_name),
        external_id=raw.external_id,
        aliases=unique_aliases,
        aliases_normalized=list(map(normalize_person, unique_aliases)),
        gender=_normalize_gender(raw.gender),
        birth_date=raw.birth_date,
        country=raw.country,
        as_alias_in_scene=raw.as_alias_in_scene,
    )
|
|
|
|
|
|
def normalize_scene(raw: RawScene) -> NormalizedScene:
    """Convert a raw scene and everything nested in it into an upsert-ready DTO.

    Index fields (``title_normalized``, ``slug``) are computed from the title;
    the studio, performers and tags are normalized by their own helpers.
    Fingerprints are flattened to ``(kind, value)`` tuples. Playback sources
    and cross-source references are shallow-copied, so the result does not
    share those containers with ``raw``.
    """
    scene_title = raw.title

    raw_studio = raw.studio
    studio = None
    if raw_studio:
        studio = normalize_studio(raw_studio)

    return NormalizedScene(
        external_id=raw.external_id,
        title=scene_title,
        title_normalized=normalize(scene_title),
        slug=slugify(scene_title),
        release_date=raw.release_date,
        description=raw.description,
        duration_sec=raw.duration_sec,
        code=raw.code,
        director=raw.director,
        url=raw.url,
        studio=studio,
        performers=list(map(normalize_performer, raw.performers)),
        tags=list(map(normalize_tag, raw.tags)),
        fingerprints=[(print_.kind, print_.value) for print_ in raw.fingerprints],
        playback_sources=[*raw.playback_sources],
        cross_source_refs=dict(raw.cross_source_refs),
    )
|