diff --git a/app/ingest.py b/app/ingest.py index 9bd6384..af4f68c 100644 --- a/app/ingest.py +++ b/app/ingest.py @@ -92,6 +92,34 @@ def _hash_raw(payload: dict) -> bytes: return hashlib.sha256(_canonical_json(payload)).digest() +def _has_nul(obj) -> bool: + """Szybki rekurencyjny check na NUL byte (U+0000) w stringach. Short-circuituje, + więc dla czystych rekordów (99.99%) zero narzutu poza tanim spacerem.""" + if isinstance(obj, str): + return "\x00" in obj + if isinstance(obj, dict): + return any(_has_nul(k) or _has_nul(v) for k, v in obj.items()) + if isinstance(obj, list): + return any(_has_nul(v) for v in obj) + return False + + +def _strip_nul(obj): + """Rekurencyjnie usuwa NUL byte (U+0000) ze stringów (wartości i klucze dict). + + Postgres JSONB/text NIE umie przechowywać `\\u0000` (DataError UntranslatableCharacter, + GOON-Z 2026-06-11: TPDB podał alias performera "Ramon\\u0000..."). Strip przed hash + + insert ORAZ przed normalize (aliasy → kolumny TEXT), żeby cała scena/film weszły do + ingestu zamiast się wywalić.""" + if isinstance(obj, str): + return obj.replace("\x00", "") if "\x00" in obj else obj + if isinstance(obj, dict): + return {_strip_nul(k): _strip_nul(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_strip_nul(v) for v in obj] + return obj + + def get_or_create_source( session: Session, *, kind: SourceKind, name: str, base_url: str | None = None ) -> Source: @@ -241,6 +269,12 @@ def ingest_from_connector( def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None: payload = raw_scene.raw or raw_scene.model_dump(mode="json") + if _has_nul(payload): + # Strip NUL z payloadu (→ external_records.raw JSONB) ORAZ ze structured fields + # (→ normalize_scene → kolumny TEXT performera/aliasów). Inaczej insert pada na + # UntranslatableCharacter i scena nigdy nie wchodzi (GOON-Z). + payload = _strip_nul(payload) + raw_scene = raw_scene.model_validate(_strip_nul(raw_scene.model_dump())) raw_hash = _hash_raw(payload) now = datetime.now(UTC) @@ -363,6 +397,9 @@ def ingest_movies_from_connector( def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None: payload = raw_movie.raw or raw_movie.model_dump(mode="json") + if _has_nul(payload): + payload = _strip_nul(payload) + raw_movie = raw_movie.model_validate(_strip_nul(raw_movie.model_dump())) raw_hash = _hash_raw(payload) now = datetime.now(UTC)