"""Map RawScene/RawPerformer/RawStudio/RawTag to normalized DTOs ready for upsert.

Normalization means computing the indexing fields (`*_normalized`, `slug`).
Raw fields such as `title` and `release_date` are passed through unchanged —
the resolver picks the canonical value.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import date

from app.connectors.base import (
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.normalize.tag_inference import infer_tag_slugs
from app.normalize.text import normalize, normalize_person, slugify


@dataclass
class NormalizedTag:
    """Tag DTO: display name plus its slug."""

    name: str
    slug: str
    # Optional — `resolve_tag` keys on slug and `Tag` has no external_id column.
    external_id: str | None = None


@dataclass
class NormalizedStudio:
    """Studio DTO with the normalized name and slug computed."""

    name: str
    name_normalized: str
    slug: str
    external_id: str | None
    parent_external_id: str | None
    parent_name: str | None
    network: str | None
    homepage_url: str | None


@dataclass
class NormalizedPerformer:
    """Performer DTO with normalized name, slug and de-duplicated aliases."""

    canonical_name: str
    name_normalized: str
    slug: str
    external_id: str | None
    aliases: list[str]
    aliases_normalized: list[str]
    gender: str | None
    birth_date: date | None
    country: str | None
    as_alias_in_scene: str | None


@dataclass
class NormalizedScene:
    """Scene DTO bundling the scene fields with its studio, performers and tags."""

    external_id: str
    title: str
    title_normalized: str
    slug: str
    release_date: date | None
    description: str | None
    duration_sec: int | None
    code: str | None
    director: str | None
    url: str | None
    studio: NormalizedStudio | None = None
    performers: list[NormalizedPerformer] = field(default_factory=list)
    tags: list[NormalizedTag] = field(default_factory=list)
    # List of (kind, value), e.g. [('phash', 'abc...'), ('oshash', '...')].
    fingerprints: list[tuple[str, str]] = field(default_factory=list)
    # Playback links (passthrough — the resolver adds them to the
    # playback_sources table).
    playback_sources: list[RawPlaybackSource] = field(default_factory=list)
    # Mapping source_name → external_id declared by the source this scene
    # comes from (e.g. StashDB exposes tpdb_id in `urls`).
    cross_source_refs: dict[str, str] = field(default_factory=dict)


def normalize_tag(raw: RawTag) -> NormalizedTag:
    """Build a `NormalizedTag`, deriving the slug from the name when it is missing."""
    # An empty/None slug from the connector falls back to a slugified name.
    tag_slug = raw.slug if raw.slug else slugify(raw.name)
    return NormalizedTag(name=raw.name, slug=tag_slug, external_id=raw.external_id)


def _merge_inferred_tags(raw_tags: list[RawTag], *, title: str) -> list[NormalizedTag]:
    """Connector tags plus tags inferred from the title (`infer_tag_slugs`).

    Tubes (96% of the playable catalogue) arrive almost untagged — only 16% of
    tube-only scenes have any tag, vs 99% of scenes matched with TPDB/StashDB.
    `infer_tag_slugs` maps phrases from the title to canonical slugs (aligned
    with the DB), so resolve_tag hits existing Tag rows.

    Union, not overwrite: when the scene later merges with TPDB, the tags add
    up. Inference runs for EVERY scene — for canonical scenes it is a no-op
    (they already have the full set), for tubes it fills the gap.
    """
    merged = [normalize_tag(raw_tag) for raw_tag in raw_tags]
    known_slugs = {tag.slug for tag in merged}

    # dict.fromkeys drops repeated inferred slugs while keeping first-seen
    # order; the filter then skips slugs the connector already supplied.
    fresh_slugs = [
        slug
        for slug in dict.fromkeys(infer_tag_slugs(title))
        if slug not in known_slugs
    ]
    merged.extend(
        # Inferred tags have no display name, so derive one from the slug.
        NormalizedTag(name=slug.replace("-", " ").title(), slug=slug, external_id=None)
        for slug in fresh_slugs
    )
    return merged


def normalize_studio(raw: RawStudio) -> NormalizedStudio:
    """Build a `NormalizedStudio`, computing the normalized name and slug."""
    studio_name = raw.name
    normalized_name = normalize(studio_name)
    # Prefer the connector-provided slug; otherwise derive it from the name.
    studio_slug = raw.slug if raw.slug else slugify(studio_name)
    return NormalizedStudio(
        name=studio_name,
        name_normalized=normalized_name,
        slug=studio_slug,
        external_id=raw.external_id,
        parent_external_id=raw.parent_external_id,
        parent_name=raw.parent_name,
        network=raw.network,
        homepage_url=raw.homepage_url,
    )


# Lower-cased source spellings → canonical gender value.
_GENDER_ALIASES = {
    # female
    "female": "female",
    "f": "female",
    # male
    "male": "male",
    "m": "male",
    # transgender female
    "transgender female": "transgender_female",
    "trans female": "transgender_female",
    "trans-female": "transgender_female",
    "transgender_female": "transgender_female",
    # transgender male
    "transgender male": "transgender_male",
    "trans male": "transgender_male",
    "trans-male": "transgender_male",
    "transgender_male": "transgender_male",
    # non-binary
    "non binary": "non_binary",
    "non-binary": "non_binary",
    "nonbinary": "non_binary",
    "non_binary": "non_binary",
    # other
    "intersex": "intersex",
    "unknown": "unknown",
}


def _normalize_gender(value: str | None) -> str | None:
    """Map a source gender string onto the `performer_gender` enum in the DB.

    TPDB/StashDB return variants with spaces/hyphens. An unmapped value
    yields None (NULL in the DB).
    """
    if not value:
        return None
    lookup_key = value.strip().lower()
    return _GENDER_ALIASES.get(lookup_key)


def normalize_performer(raw: RawPerformer) -> NormalizedPerformer:
    """Build a `NormalizedPerformer` with de-duplicated aliases and normalized name."""
    performer_name = raw.name
    # De-duplicate while preserving the original order.
    unique_aliases = [*dict.fromkeys(raw.aliases)]
    return NormalizedPerformer(
        canonical_name=performer_name,
        name_normalized=normalize_person(performer_name),
        slug=slugify(performer_name),
        external_id=raw.external_id,
        aliases=unique_aliases,
        aliases_normalized=[normalize_person(alias) for alias in unique_aliases],
        gender=_normalize_gender(raw.gender),
        birth_date=raw.birth_date,
        country=raw.country,
        as_alias_in_scene=raw.as_alias_in_scene,
    )


def normalize_scene(raw: RawScene) -> NormalizedScene:
    """Build a `NormalizedScene`, normalizing the nested studio, performers and tags.

    Tags are the union of the connector tags and those inferred from the
    title. Playback sources and cross-source refs are shallow-copied.
    """
    scene_title = raw.title
    title_normalized = normalize(scene_title)
    scene_slug = slugify(scene_title)

    studio = None if not raw.studio else normalize_studio(raw.studio)
    performers = [normalize_performer(performer) for performer in raw.performers]
    tags = _merge_inferred_tags(raw.tags, title=scene_title)
    fingerprints = [(fingerprint.kind, fingerprint.value) for fingerprint in raw.fingerprints]

    return NormalizedScene(
        external_id=raw.external_id,
        title=scene_title,
        title_normalized=title_normalized,
        slug=scene_slug,
        release_date=raw.release_date,
        description=raw.description,
        duration_sec=raw.duration_sec,
        code=raw.code,
        director=raw.director,
        url=raw.url,
        studio=studio,
        performers=performers,
        tags=tags,
        fingerprints=fingerprints,
        # Copies, so the DTO does not share containers with the raw object.
        playback_sources=[*raw.playback_sources],
        cross_source_refs=dict(raw.cross_source_refs),
    )