# goon/app/normalize/text.py

"""Normalizacja tekstu używana wszędzie w pipeline'ie matchingu.

Cel: dwie formy tej samej nazwy (różne casing, akcenty, interpunkcja, whitespace, „the"/myślniki)
muszą produkować identyczny string po `normalize`. Trigram index na tych polach robi resztę.
"""
from __future__ import annotations

import re
import unicodedata

from slugify import slugify as _slugify

# Runs of characters that are neither lowercase ASCII letters, ASCII digits nor
# whitespace (punctuation, symbols, letters that did not reduce to ASCII).
_NON_ASCII_ALNUM_RE = re.compile(r"[^a-z0-9\s]+")
# One or more consecutive whitespace characters.
_WS_RE = re.compile(r"\s+")
# A leading "the", "a" or "an" followed by whitespace, matched case-insensitively.
_LEADING_ARTICLE_RE = re.compile(r"^(the|a|an)\s+", flags=re.IGNORECASE)

# Letters that NFKD does not decompose into an ASCII base letter; mapped by hand.
_EXTRA_TRANSLIT = str.maketrans(
    {
        "Ł": "L", "ł": "l",
        "Ø": "O", "ø": "o",
        "Æ": "AE", "æ": "ae",
        "Œ": "OE", "œ": "oe",
        "Þ": "Th", "þ": "th",
        "Ð": "D", "ð": "d",
        "Đ": "D", "đ": "d",
        "ß": "ss",
        "\u1e9e": "SS",  # capital sharp s, the uppercase counterpart of "ß"
        "Ħ": "H", "ħ": "h",
    }
)


def strip_accents(value: str) -> str:
    """Return *value* with diacritics removed and special letters transliterated.

    The text is NFKD-decomposed, combining marks are dropped, and only then are
    the letters listed in ``_EXTRA_TRANSLIT`` replaced by their ASCII spelling.

    The table lookup must run after decomposition: precomposed characters such
    as U+01FE (O with stroke and acute) decompose into a letter from the table
    plus a combining mark, and would otherwise leave that letter untouched.
    """
    decomposed = unicodedata.normalize("NFKD", value)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    return without_marks.translate(_EXTRA_TRANSLIT)


def normalize(value: str | None) -> str:
    """Return the canonical matching form of *value*.

    Steps, in order: strip accents, lowercase, replace every run of characters
    other than ``a-z``, ``0-9`` and whitespace with a space, collapse whitespace
    and trim, then drop one leading article ("the", "a", "an").

    ``None`` and the empty string both yield ``""``.
    """
    if not value:
        return ""
    folded = strip_accents(value).lower()
    spaced = _NON_ASCII_ALNUM_RE.sub(" ", folded)
    collapsed = _WS_RE.sub(" ", spaced).strip()
    # The article is removed last, once the text is lowercased and trimmed.
    return _LEADING_ARTICLE_RE.sub("", collapsed)


def normalize_person(value: str | None) -> str:
    """Return the canonical matching form of a person name.

    Delegates to `normalize`, then collapses whitespace and trims once more.
    Initials such as "M." come out as "m" because `normalize` replaces
    punctuation with spaces.

    ``None`` and the empty string both yield ``""``.
    """
    # NOTE(review): `normalize` also drops a leading "the"/"a"/"an", so a name
    # starting with the initial "A." loses that initial -- confirm this is
    # intended for person names.
    if not value:
        return ""
    return _WS_RE.sub(" ", normalize(value)).strip()


def slugify(value: str | None) -> str:
    """Return a slug for *value*, or ``""`` when *value* is ``None`` or empty.

    Delegates to the imported ``slugify`` function with ``lowercase=True`` and
    ``max_length=200``.
    """
    return _slugify(value, lowercase=True, max_length=200) if value else ""