Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
"""Normalizacja tekstu używana wszędzie w pipeline'ie matchingu.
|
|
|
|
Cel: dwie formy tej samej nazwy (różne casing, akcenty, interpunkcja, whitespace, „the"/myślniki)
|
|
muszą produkować identyczny string po `normalize`. Trigram index na tych polach robi resztę.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import unicodedata
|
|
|
|
from slugify import slugify as _slugify
|
|
|
|
# One or more characters that are not a lowercase ASCII letter, a digit or whitespace.
_NON_ASCII_ALNUM_RE = re.compile(r"[^a-z0-9\s]+")

# Any run of whitespace.
_WS_RE = re.compile(r"\s+")

# A leading "the" / "a" / "an" followed by whitespace, matched case-insensitively.
_LEADING_ARTICLE_RE = re.compile(r"^(the|a|an)\s+", re.IGNORECASE)
|
|
|
|
# Letters that NFKD does not decompose into an ASCII base letter — mapped by hand.
# Both cases of a letter must fold to the same text once the result is
# lowercased; otherwise one spelling survives as ASCII while the other is
# discarded by the later ASCII-only filter.
_EXTRA_TRANSLIT = str.maketrans(
    {
        "Ł": "L", "ł": "l",
        "Ø": "O", "ø": "o",
        "Æ": "AE", "æ": "ae",
        "Œ": "OE", "œ": "oe",
        "Þ": "Th", "þ": "th",
        "Ð": "D", "ð": "d",
        "Đ": "D", "đ": "d",
        # Capital sharp s (U+1E9E) has no NFKD decomposition either; without
        # this entry "STRAẞE" and "straße" would normalize differently.
        "ẞ": "SS", "ß": "ss",
        "Ħ": "H", "ħ": "h",
        # Dotless i: its uppercase form is plain ASCII "I", so the lowercase
        # form has to map to "i" for both casings to agree.
        "ı": "i",
    }
)
|
|
|
|
|
|
def strip_accents(value: str) -> str:
    """Return *value* with diacritics removed.

    Letters listed in `_EXTRA_TRANSLIT` are replaced first, then the text is
    NFKD-decomposed and every combining character is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", value.translate(_EXTRA_TRANSLIT))
    # combining() returns 0 for characters that are not combining marks.
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)
|
|
|
|
|
|
def normalize(value: str | None) -> str:
    """Fold *value* into its canonical matching form.

    Steps, in order: strip accents, lowercase, replace every run of characters
    other than a-z, 0-9 and whitespace with a space, collapse whitespace and
    trim, then remove one leading "the" / "a" / "an".

    Returns "" for None or an empty string.
    """
    if not value:
        return ""
    folded = strip_accents(value).lower()
    ascii_only = _NON_ASCII_ALNUM_RE.sub(" ", folded)
    collapsed = _WS_RE.sub(" ", ascii_only).strip()
    # The article is dropped last, once spacing is already canonical.
    return _LEADING_ARTICLE_RE.sub("", collapsed)
|
|
|
|
|
|
def normalize_person(value: str | None) -> str:
    """Normalize a person's name for matching.

    Applies the same folding as `normalize`: accents stripped, lowercased,
    punctuation replaced by spaces, whitespace collapsed and trimmed. An
    initial such as "M." therefore becomes "m".

    Unlike `normalize`, a leading "a" / "an" / "the" is kept. In a personal
    name that token is an initial or a given name ("A. J. Smith", "An Li"),
    and dropping it would merge distinct people.

    NOTE(review): output differs from the previous implementation for names
    starting with a/an/the — re-normalize any persisted values if such exist.

    Returns "" for None or an empty string.
    """
    if not value:
        return ""
    folded = strip_accents(value).lower()
    ascii_only = _NON_ASCII_ALNUM_RE.sub(" ", folded)
    return _WS_RE.sub(" ", ascii_only).strip()
|
|
|
|
|
|
def slugify(value: str | None) -> str:
    """Return a slug for *value*, or "" when it is None or empty.

    Delegates to the imported third-party `slugify` function with
    `lowercase=True` and `max_length=200`.
    """
    return _slugify(value, lowercase=True, max_length=200) if value else ""
|