goon/app/normalize/scenes.py
jtrzupek da7fcda132 feat(ingest): SQL phash match, tag inference + backfill, clip-store skip, browse tubes, watchdog
Resolver/perf:
- find_by_phash_within: nearest match via Postgres bit_count over bit(64) XOR
  instead of a Python scan of all phash fingerprints (~20x faster per scene;
  unblocks long delta runs that were killed mid-run before `since` advanced).

Scheduler/reliability:
- reap ingest_runs stuck in 'running' on worker startup (killed_by_restart).
- smoke_test: per-source ingest health, stuck-run and browse-freshness checks
  -> Sentry; exclude killed_by_restart from the failed-run alarm.

Tags (ingest with tags + fill blanks):
- wire infer_tag_slugs into normalize_scene so tube scenes get title-inferred
  tags (was dead code); union with connector tags.
- scripts/backfill_inferred_tags.py: keyset/batched/idempotent backfill for
  existing tagless scenes (playable tag coverage 16% -> ~52%).

Clip-store:
- skip ManyVids/IWantClips/Clips4Sale/... from canonical sources at ingest
  (GOON_SKIP_CLIP_STORE, default on) — permanent orphans, ~56% of canonical
  ingest, never have a free-tube playback source.

Browse tubes:
- enable fullmovies + hdporn.gg: studio parsed from title prefix instead of
  the /networks/ sidebar (which always yielded the first listed network);
  drop phash compute (pilot: 0% canonical hit within Hamming 5 — auto-screenshots),
  matching relies on title/performer/duration.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 15:07:35 +02:00

189 lines
6.1 KiB
Python

"""Mapowanie RawScene/RawPerformer/RawStudio/RawTag → znormalizowane DTO gotowe do upsertu.
Normalizacja = wyliczenie pól indeksujących (`*_normalized`, `slug`). Surowe pola jak
`title`, `release_date` przekazujemy bez zmian — kanon wybiera resolver.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date
from app.connectors.base import (
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.normalize.tag_inference import infer_tag_slugs
from app.normalize.text import normalize, normalize_person, slugify
@dataclass
class NormalizedTag:
    """Tag DTO produced by `normalize_tag`: display name, slug, optional source id."""

    name: str
    slug: str
    # Optional — `resolve_tag` keys on slug and `Tag` has no external_id column.
    external_id: str | None = None
@dataclass
class NormalizedStudio:
    """Studio DTO produced by `normalize_studio`: raw fields plus derived index keys."""

    name: str
    # Derived: `normalize(name)`.
    name_normalized: str
    # Connector-supplied slug when it is truthy, otherwise `slugify(name)`.
    slug: str
    # The remaining fields are copied from the raw studio unchanged.
    external_id: str | None
    parent_external_id: str | None
    parent_name: str | None
    network: str | None
    homepage_url: str | None
@dataclass
class NormalizedPerformer:
    """Performer DTO produced by `normalize_performer`."""

    # Raw performer name, passed through unchanged.
    canonical_name: str
    # Derived: `normalize_person(name)`.
    name_normalized: str
    # Derived: `slugify(name)`.
    slug: str
    external_id: str | None
    # Raw aliases with exact duplicates removed, first-seen order preserved.
    aliases: list[str]
    # `normalize_person` applied to each entry of `aliases`, in the same order.
    aliases_normalized: list[str]
    # Result of `_normalize_gender`; None when the raw value is empty or unmapped.
    gender: str | None
    birth_date: date | None
    country: str | None
    # Passed through from the raw performer.
    # NOTE(review): presumably the alias credited in this particular scene — confirm.
    as_alias_in_scene: str | None
@dataclass
class NormalizedScene:
    """Scene DTO produced by `normalize_scene`, ready to be handed to the resolver."""

    # --- identity and index keys -------------------------------------------
    external_id: str
    title: str
    title_normalized: str
    slug: str

    # --- raw metadata, passed through unchanged ----------------------------
    release_date: date | None
    description: str | None
    duration_sec: int | None
    code: str | None
    director: str | None
    url: str | None

    # --- related entities --------------------------------------------------
    studio: NormalizedStudio | None = None
    performers: list[NormalizedPerformer] = field(default_factory=list)
    tags: list[NormalizedTag] = field(default_factory=list)

    # List of (kind, value) pairs, e.g. [('phash', 'abc...'), ('oshash', '...')].
    fingerprints: list[tuple[str, str]] = field(default_factory=list)

    # Playback links (passthrough — the resolver adds them to the
    # playback_sources table).
    playback_sources: list[RawPlaybackSource] = field(default_factory=list)

    # Mapping source_name -> external_id declared by the source this scene
    # comes from (e.g. StashDB exposes tpdb_id in `urls`).
    cross_source_refs: dict[str, str] = field(default_factory=dict)
def normalize_tag(raw: RawTag) -> NormalizedTag:
    """Convert a connector tag into a `NormalizedTag`.

    The connector-supplied slug is used when it is truthy; otherwise the slug
    is derived from the tag name with `slugify`.
    """
    tag_slug = raw.slug
    if not tag_slug:
        tag_slug = slugify(raw.name)
    return NormalizedTag(name=raw.name, slug=tag_slug, external_id=raw.external_id)
def _merge_inferred_tags(raw_tags: list[RawTag], *, title: str) -> list[NormalizedTag]:
    """Return connector tags followed by tags inferred from the title.

    Background (original author's notes): tube sources — 96% of the playable
    catalogue — arrive almost untagged; only 16% of tube-only scenes carry any
    tag, versus 99% of scenes matched with TPDB/StashDB. `infer_tag_slugs` maps
    phrases in the title to canonical slugs aligned with the database, so
    `resolve_tag` hits existing Tag rows.

    The result is a union, not a replacement: connector tags come first, then
    every inferred slug that is not already present. Inference runs for every
    scene; for canonical sources it is expected to add nothing, for tubes it
    fills the gap.
    """
    merged = list(map(normalize_tag, raw_tags))
    known_slugs = {tag.slug for tag in merged}
    for inferred_slug in infer_tag_slugs(title):
        if inferred_slug not in known_slugs:
            known_slugs.add(inferred_slug)
            # Inferred tags have no display name of their own: derive one
            # from the slug ("big-tits" -> "Big Tits").
            display_name = inferred_slug.replace("-", " ").title()
            merged.append(
                NormalizedTag(name=display_name, slug=inferred_slug, external_id=None)
            )
    return merged
def normalize_studio(raw: RawStudio) -> NormalizedStudio:
    """Convert a connector studio into a `NormalizedStudio`.

    Computes the index keys (`name_normalized`, `slug`) from the studio name
    and copies every other field through unchanged. A truthy connector slug
    takes precedence over the one derived with `slugify`.
    """
    studio_name = raw.name
    normalized_name = normalize(studio_name)
    studio_slug = raw.slug or slugify(studio_name)
    return NormalizedStudio(
        name=studio_name,
        name_normalized=normalized_name,
        slug=studio_slug,
        external_id=raw.external_id,
        parent_external_id=raw.parent_external_id,
        parent_name=raw.parent_name,
        network=raw.network,
        homepage_url=raw.homepage_url,
    )
# Lookup keys are in canonical form: lowercase, words joined by a single
# underscore (see `_normalize_gender`, which canonicalises its input the same
# way). Values are the labels of the `performer_gender` enum in the database.
_GENDER_ALIASES = {
    "female": "female",
    "f": "female",
    "male": "male",
    "m": "male",
    "transgender_female": "transgender_female",
    "trans_female": "transgender_female",
    "transgender_male": "transgender_male",
    "trans_male": "transgender_male",
    "non_binary": "non_binary",
    "nonbinary": "non_binary",
    "intersex": "intersex",
    "unknown": "unknown",
}


def _normalize_gender(value: str | None) -> str | None:
    """Map a raw gender string onto the `performer_gender` enum used in the DB.

    TPDB/StashDB return variants that differ in case and in the separator
    between words (spaces, hyphens, underscores). The value is lowercased and
    any run of those separators is collapsed to a single underscore before the
    lookup, so "Trans-Female", "trans_female" and "TRANSGENDER  FEMALE" all
    resolve to "transgender_female".

    Returns None for None, an empty or whitespace-only string, or any value
    that is not in `_GENDER_ALIASES` (stored as NULL in the DB).
    """
    if not value:
        return None
    # Treat "-" and "_" as whitespace, then let split() swallow runs of
    # separators as well as leading/trailing ones.
    words = value.lower().replace("-", " ").replace("_", " ").split()
    return _GENDER_ALIASES.get("_".join(words))
def normalize_performer(raw: RawPerformer) -> NormalizedPerformer:
    """Convert a connector performer into a `NormalizedPerformer`.

    Derives the index keys from the performer name, removes exact duplicate
    aliases while keeping first-seen order, normalizes each remaining alias
    with `normalize_person`, and maps the raw gender through
    `_normalize_gender`. Everything else is copied through unchanged.
    """
    performer_name = raw.name
    # dict keys are unique and insertion-ordered, which gives an
    # order-preserving de-duplication of the raw alias list.
    unique_aliases = [*dict.fromkeys(raw.aliases)]
    normalized_aliases = list(map(normalize_person, unique_aliases))
    return NormalizedPerformer(
        canonical_name=performer_name,
        name_normalized=normalize_person(performer_name),
        slug=slugify(performer_name),
        external_id=raw.external_id,
        aliases=unique_aliases,
        aliases_normalized=normalized_aliases,
        gender=_normalize_gender(raw.gender),
        birth_date=raw.birth_date,
        country=raw.country,
        as_alias_in_scene=raw.as_alias_in_scene,
    )
def normalize_scene(raw: RawScene) -> NormalizedScene:
    """Convert a connector scene into a `NormalizedScene`.

    Index keys (`title_normalized`, `slug`) are derived from the title; the
    studio and performers are normalized with their own helpers; tags are the
    connector tags merged with tags inferred from the title. Fingerprints are
    flattened to (kind, value) pairs, and playback sources and cross-source
    refs are shallow-copied so the DTO does not share containers with `raw`.
    """
    scene_title = raw.title

    studio = None
    if raw.studio:
        studio = normalize_studio(raw.studio)

    performers = list(map(normalize_performer, raw.performers))
    tags = _merge_inferred_tags(raw.tags, title=scene_title)

    fingerprint_pairs = []
    for fingerprint in raw.fingerprints:
        fingerprint_pairs.append((fingerprint.kind, fingerprint.value))

    return NormalizedScene(
        external_id=raw.external_id,
        title=scene_title,
        title_normalized=normalize(scene_title),
        slug=slugify(scene_title),
        release_date=raw.release_date,
        description=raw.description,
        duration_sec=raw.duration_sec,
        code=raw.code,
        director=raw.director,
        url=raw.url,
        studio=studio,
        performers=performers,
        tags=tags,
        fingerprints=fingerprint_pairs,
        playback_sources=list(raw.playback_sources),
        cross_source_refs=dict(raw.cross_source_refs),
    )