goon/app/resolve/performer_resolver.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

145 lines
4.9 KiB
Python

"""M2: simple performer resolver.
Paths:
1. exact external_ref for the source → update.
2. name_normalized match in `performers` → reuse + attach external_ref.
3. alias_normalized match in `performer_aliases` → reuse parent + external_ref.
4. insert a new performer.
(Fuzzy / triage merge_candidates: M8.)
"""
from __future__ import annotations
import logging
import uuid
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.models.performer import Performer, PerformerAlias, PerformerExternalRef
from app.normalize.scenes import NormalizedPerformer
from app.normalize.text import slugify
log = logging.getLogger(__name__)
def _lookup_external_ref(
    session: Session,
    source_id: uuid.UUID,
    external_id: str,
) -> PerformerExternalRef | None:
    """Return the external-ref row for (source_id, external_id), or None."""
    stmt = select(PerformerExternalRef).where(
        PerformerExternalRef.source_id == source_id,
        PerformerExternalRef.external_id == external_id,
    )
    return session.execute(stmt).scalar_one_or_none()


def resolve_performer(
    session: Session,
    *,
    norm: NormalizedPerformer,
    source_id: uuid.UUID,
) -> Performer:
    """Find or create the Performer row matching a normalized performer.

    Resolution order:
      1. An external ref for (source_id, norm.external_id) already exists:
         reuse its performer, fill missing fields, ensure aliases, return.
      2. A performer whose name_normalized equals norm.name_normalized.
      3. An alias whose alias_normalized equals norm.name_normalized:
         reuse the alias' parent performer.
      4. Otherwise insert a new performer and flush the session.

    For paths 2-4 an external ref is added when norm.external_id is set and
    no such ref exists yet, and the aliases from ``norm`` are ensured.

    Args:
        session: SQLAlchemy session used for all queries and inserts.
        norm: Normalized performer payload to resolve.
        source_id: Id of the source that ``norm`` came from.

    Returns:
        The matched or newly created Performer.
    """
    # Path 1: exact external reference for this source.
    if norm.external_id:
        known_ref = _lookup_external_ref(session, source_id, norm.external_id)
        if known_ref is not None:
            performer = session.get(Performer, known_ref.performer_id)
            # A ref row is expected to always point at an existing performer.
            assert performer is not None
            _update_performer_fields(performer, norm)
            _ensure_aliases(session, performer.id, norm, source_id)
            return performer

    # name_normalized and alias_normalized have no unique constraint in the
    # schema; historically duplicates do occur (e.g. the same name_normalized
    # on different Performer rows after unfortunate ingests). Take the first
    # stable match (ordered by id) instead of blowing up.
    by_name = (
        select(Performer)
        .where(Performer.name_normalized == norm.name_normalized)
        .order_by(Performer.id)
        .limit(1)
    )
    performer = session.execute(by_name).scalars().first()

    if performer is None:
        # Path 3: fall back to an alias match and reuse its parent performer.
        by_alias = (
            select(PerformerAlias)
            .where(PerformerAlias.alias_normalized == norm.name_normalized)
            .order_by(PerformerAlias.performer_id)
            .limit(1)
        )
        alias_row = session.execute(by_alias).scalars().first()
        if alias_row is not None:
            performer = session.get(Performer, alias_row.performer_id)

    if performer is not None:
        _update_performer_fields(performer, norm)
    else:
        # Path 4: nothing matched, so insert a fresh row.
        performer = Performer(
            canonical_name=norm.canonical_name,
            name_normalized=norm.name_normalized,
            slug=_unique_slug(
                session,
                norm.slug or slugify(norm.canonical_name) or "performer",
            ),
            gender=norm.gender,
            birth_date=norm.birth_date,
            country=norm.country,
        )
        session.add(performer)
        # Flush before performer.id is read below.
        session.flush()
        log.debug("performer create id=%s name=%s", performer.id, performer.canonical_name)

    # Attach the external ref unless one already exists for this source/id.
    if norm.external_id and _lookup_external_ref(session, source_id, norm.external_id) is None:
        new_ref = PerformerExternalRef(
            source_id=source_id,
            external_id=norm.external_id,
            performer_id=performer.id,
            confidence=1.0,
        )
        session.add(new_ref)

    _ensure_aliases(session, performer.id, norm, source_id)
    return performer
def _update_performer_fields(performer: Performer, norm: NormalizedPerformer) -> None:
    """Fill empty gender / birth_date / country on ``performer`` from ``norm``.

    A field is copied only when ``norm`` carries a truthy value and the
    performer's current value is falsy; existing values are never overwritten.
    """
    for field_name in ("gender", "birth_date", "country"):
        incoming = getattr(norm, field_name)
        if not incoming:
            continue
        if getattr(performer, field_name):
            # Keep whatever the performer already has.
            continue
        setattr(performer, field_name, incoming)
def _ensure_aliases(
    session: Session,
    performer_id: uuid.UUID,
    norm: NormalizedPerformer,
    source_id: uuid.UUID,
) -> None:
    """Add alias rows from ``norm`` that the performer does not have yet.

    ``norm.aliases`` and ``norm.aliases_normalized`` are walked in lockstep
    (``zip(..., strict=True)``, so a length mismatch raises ValueError).
    An alias is skipped when its normalized form is empty, was already
    handled in this call, or equals ``norm.name_normalized``. For the rest,
    a PerformerAlias row is added unless one with the same performer_id and
    alias_normalized already exists.
    """
    handled: set[str] = set()
    pairs = zip(norm.aliases, norm.aliases_normalized, strict=True)
    for raw_alias, normalized in pairs:
        usable = (
            normalized
            and normalized not in handled
            and normalized != norm.name_normalized
        )
        if not usable:
            continue
        handled.add(normalized)

        lookup = select(PerformerAlias.id).where(
            PerformerAlias.performer_id == performer_id,
            PerformerAlias.alias_normalized == normalized,
        )
        if session.execute(lookup).first() is not None:
            # This performer already has the alias.
            continue

        session.add(
            PerformerAlias(
                performer_id=performer_id,
                alias=raw_alias,
                alias_normalized=normalized,
                source_id=source_id,
            )
        )
def _unique_slug(session: Session, base: str) -> str:
    """Return ``base`` or the first ``base-N`` (N = 2, 3, ...) not used as a Performer slug.

    Each candidate is checked with its own query against ``Performer.slug``.
    """

    def is_taken(slug: str) -> bool:
        stmt = select(Performer.id).where(Performer.slug == slug)
        return session.execute(stmt).first() is not None

    if not is_taken(base):
        return base

    # Numbered variants start at 2; the bare base counts as the first.
    suffix = 2
    while is_taken(f"{base}-{suffix}"):
        suffix += 1
    return f"{base}-{suffix}"