fix(ingest): strip NUL bytes from raw payloads before Postgres write

A source (TPDB) returned a performer alias containing a literal U+0000 ("Ramon\u0000...").
Postgres cannot store \u0000 in JSONB or text, so the external_records JSONB insert in
_upsert_external_record failed with UntranslatableCharacter and the scene was never ingested
(GOON-Z). Recursively strip NUL from the raw payload (-> external_records.raw) and, when a
NUL is present, also re-validate the RawScene/RawMovie so that normalize -> typed text
columns get clean data too. The work is gated by a cheap _has_nul scan, so clean records
(the overwhelming majority) pay nothing beyond that scan.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-11 19:48:22 +02:00
parent 16eb633bde
commit 1654d78d59

View file

@ -92,6 +92,34 @@ def _hash_raw(payload: dict) -> bytes:
return hashlib.sha256(_canonical_json(payload)).digest() return hashlib.sha256(_canonical_json(payload)).digest()
def _has_nul(obj) -> bool:
"""Szybki rekurencyjny check na NUL byte (U+0000) w stringach. Short-circuituje,
więc dla czystych rekordów (99.99%) zero narzutu poza tanim spacerem."""
if isinstance(obj, str):
return "\x00" in obj
if isinstance(obj, dict):
return any(_has_nul(k) or _has_nul(v) for k, v in obj.items())
if isinstance(obj, list):
return any(_has_nul(v) for v in obj)
return False
def _strip_nul(obj):
"""Rekurencyjnie usuwa NUL byte (U+0000) ze stringów (wartości i klucze dict).
Postgres JSONB/text NIE umie przechowywać `\\u0000` (DataError UntranslatableCharacter,
GOON-Z 2026-06-11: TPDB podał alias performera "Ramon\\u0000..."). Strip przed hash +
insert ORAZ przed normalize (aliasy kolumny TEXT), żeby cała scena/film weszły do
ingestu zamiast się wywalić."""
if isinstance(obj, str):
return obj.replace("\x00", "") if "\x00" in obj else obj
if isinstance(obj, dict):
return {_strip_nul(k): _strip_nul(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_strip_nul(v) for v in obj]
return obj
def get_or_create_source( def get_or_create_source(
session: Session, *, kind: SourceKind, name: str, base_url: str | None = None session: Session, *, kind: SourceKind, name: str, base_url: str | None = None
) -> Source: ) -> Source:
@ -241,6 +269,12 @@ def ingest_from_connector(
def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None: def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None:
payload = raw_scene.raw or raw_scene.model_dump(mode="json") payload = raw_scene.raw or raw_scene.model_dump(mode="json")
if _has_nul(payload):
# Strip NUL z payloadu (→ external_records.raw JSONB) ORAZ ze structured fields
# (→ normalize_scene → kolumny TEXT performera/aliasów). Inaczej insert pada na
# UntranslatableCharacter i scena nigdy nie wchodzi (GOON-Z).
payload = _strip_nul(payload)
raw_scene = raw_scene.model_validate(_strip_nul(raw_scene.model_dump()))
raw_hash = _hash_raw(payload) raw_hash = _hash_raw(payload)
now = datetime.now(UTC) now = datetime.now(UTC)
@ -363,6 +397,9 @@ def ingest_movies_from_connector(
def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None: def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None:
payload = raw_movie.raw or raw_movie.model_dump(mode="json") payload = raw_movie.raw or raw_movie.model_dump(mode="json")
if _has_nul(payload):
payload = _strip_nul(payload)
raw_movie = raw_movie.model_validate(_strip_nul(raw_movie.model_dump()))
raw_hash = _hash_raw(payload) raw_hash = _hash_raw(payload)
now = datetime.now(UTC) now = datetime.now(UTC)