From 1654d78d598238d143f0e069d1a72f9dfdea580f Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Thu, 11 Jun 2026 19:48:22 +0200 Subject: [PATCH] fix(ingest): strip NUL bytes from raw payloads before Postgres write A source (TPDB) returned a performer alias containing a literal U+0000 ("Ramon.."). Postgres cannot store in JSONB or text, so the external_records JSONB insert in _upsert_external_record failed with UntranslatableCharacter and the scene never ingested (GOON-Z). Recursively strip NUL from the raw payload (-> external_records.raw) and, when present, also re-validate the RawScene/RawMovie so normalize -> typed text columns get clean data too. Gated by a cheap _has_nul scan so clean records (the overwhelming majority) pay no extra cost. Co-Authored-By: Claude Fable 5 --- app/ingest.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/app/ingest.py b/app/ingest.py index 9bd6384..af4f68c 100644 --- a/app/ingest.py +++ b/app/ingest.py @@ -92,6 +92,34 @@ def _hash_raw(payload: dict) -> bytes: return hashlib.sha256(_canonical_json(payload)).digest() +def _has_nul(obj) -> bool: + """Szybki rekurencyjny check na NUL byte (U+0000) w stringach. Short-circuituje, + więc dla czystych rekordów (99.99%) zero narzutu poza tanim spacerem.""" + if isinstance(obj, str): + return "\x00" in obj + if isinstance(obj, dict): + return any(_has_nul(k) or _has_nul(v) for k, v in obj.items()) + if isinstance(obj, list): + return any(_has_nul(v) for v in obj) + return False + + +def _strip_nul(obj): + """Rekurencyjnie usuwa NUL byte (U+0000) ze stringów (wartości i klucze dict). + + Postgres JSONB/text NIE umie przechowywać `\\u0000` (DataError UntranslatableCharacter, + GOON-Z 2026-06-11: TPDB podał alias performera "Ramon\\u0000..."). Strip przed hash + + insert ORAZ przed normalize (aliasy → kolumny TEXT), żeby cała scena/film weszły do + ingestu zamiast się wywalić.""" + if isinstance(obj, str): + return obj.replace("\x00", "") if "\x00" in obj else obj + if isinstance(obj, dict): + return {_strip_nul(k): _strip_nul(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_strip_nul(v) for v in obj] + return obj + + def get_or_create_source( session: Session, *, kind: SourceKind, name: str, base_url: str | None = None ) -> Source: @@ -241,6 +269,12 @@ def ingest_from_connector( def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None: payload = raw_scene.raw or raw_scene.model_dump(mode="json") + if _has_nul(payload): + # Strip NUL z payloadu (→ external_records.raw JSONB) ORAZ ze structured fields + # (→ normalize_scene → kolumny TEXT performera/aliasów). Inaczej insert pada na + # UntranslatableCharacter i scena nigdy nie wchodzi (GOON-Z). + payload = _strip_nul(payload) + raw_scene = raw_scene.model_validate(_strip_nul(raw_scene.model_dump())) raw_hash = _hash_raw(payload) now = datetime.now(UTC) @@ -363,6 +397,9 @@ def ingest_movies_from_connector( def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None: payload = raw_movie.raw or raw_movie.model_dump(mode="json") + if _has_nul(payload): + payload = _strip_nul(payload) + raw_movie = raw_movie.model_validate(_strip_nul(raw_movie.model_dump())) raw_hash = _hash_raw(payload) now = datetime.now(UTC)