goon/app/normalize/text.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

61 lines
1.8 KiB
Python

"""Normalizacja tekstu używana wszędzie w pipeline'ie matchingu.
Cel: dwie formy tej samej nazwy (różne casing, akcenty, interpunkcja, whitespace, „the"/myślniki)
muszą produkować identyczny string po `normalize`. Trgram index na tych polach robi resztę.
"""
from __future__ import annotations
import re
import unicodedata
from slugify import slugify as _slugify
# Runs of anything that is not a lowercase ASCII letter, a digit or whitespace.
# Meant to be applied after lower-casing and accent stripping.
_NON_ASCII_ALNUM_RE = re.compile(r"[^a-z0-9\s]+")

# One or more consecutive whitespace characters.
_WS_RE = re.compile(r"\s+")

# A leading English article ("the", "a", "an") plus the whitespace after it,
# matched case-insensitively.
_LEADING_ARTICLE_RE = re.compile(r"^(the|a|an)\s+", re.IGNORECASE)
# Letters that NFKD does not decompose into ASCII, so they are mapped by hand.
# Both cases of every letter must be present: the table is applied before
# lower-casing, and a letter missing in one case would make two casings of the
# same name normalize to different strings.
_EXTRA_TRANSLIT = str.maketrans(
    {
        "Ł": "L", "ł": "l",
        "Ø": "O", "ø": "o",
        "Æ": "AE", "æ": "ae",
        "Œ": "OE", "œ": "oe",
        "Þ": "Th", "þ": "th",
        "Ð": "D", "ð": "d",
        "Đ": "D", "đ": "d",
        # Capital sharp s lower-cases to "ß", so it needs its own entry.
        "ẞ": "SS", "ß": "ss",
        "Ħ": "H", "ħ": "h",
        # Dotless i upper-cases to a plain "I"; without this entry the
        # lowercase spelling would lose the letter while the uppercase keeps it.
        "ı": "i",
    }
)
def strip_accents(value: str) -> str:
    """Return ``value`` with diacritics removed.

    Letters listed in ``_EXTRA_TRANSLIT`` are replaced first; the result is
    then NFKD-decomposed and every combining character is dropped. Casing is
    left untouched.
    """
    decomposed = unicodedata.normalize("NFKD", value.translate(_EXTRA_TRANSLIT))
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def normalize(value: str | None) -> str:
    """Fold ``value`` into its canonical matching form.

    Steps, in order: strip accents, lower-case, replace every run of
    characters other than ``a-z``, ``0-9`` and whitespace with a space,
    collapse whitespace and trim, then drop one leading article
    ("the", "a", "an"). Returns "" for ``None`` or an empty string.
    """
    if value:
        folded = strip_accents(value).lower()
        depunctuated = _NON_ASCII_ALNUM_RE.sub(" ", folded)
        squeezed = _WS_RE.sub(" ", depunctuated).strip()
        return _LEADING_ARTICLE_RE.sub("", squeezed)
    return ""
def normalize_person(value: str | None) -> str:
    """Fold a person's name into its canonical matching form.

    Applies the same folding as `normalize` (accent stripping, lower-casing,
    punctuation removal, whitespace collapsing), so an initial such as "M."
    becomes "m".

    Unlike `normalize`, a leading "a" or "an" is kept: in a person's name it
    is an initial or a given name ("A. J. Applegate", "An Li"), not an
    article, and dropping it would discard part of the name. A leading "the"
    is still removed. Returns "" for ``None`` or an empty string.
    """
    if not value:
        return ""
    out = strip_accents(value).lower()
    out = _NON_ASCII_ALNUM_RE.sub(" ", out)
    out = _WS_RE.sub(" ", out).strip()
    # Whitespace is already collapsed, so the article is followed by exactly
    # one space. Only "the" is treated as an article here (see docstring).
    article = "the "
    if out.startswith(article):
        out = out[len(article):]
    return out
def slugify(value: str | None) -> str:
    """Return a slug for ``value``, or "" for ``None`` or an empty string.

    Delegates to the imported ``slugify`` function with ``lowercase=True``
    and ``max_length=200``.
    """
    return _slugify(value, lowercase=True, max_length=200) if value else ""