fix(ingest): strip NUL bytes from raw payloads before Postgres write
A source (TPDB) returned a performer alias containing a literal U+0000 ("Ramon\u0000...").
Postgres cannot store U+0000 in JSONB or text, so the external_records JSONB insert in
_upsert_external_record failed with UntranslatableCharacter and the scene never ingested
(GOON-Z). Recursively strip NUL from the raw payload (-> external_records.raw) and, when
present, also re-validate the RawScene/RawMovie so normalize -> typed text columns get
clean data too. Gated by a cheap _has_nul scan so clean records (the overwhelming
majority) pay no extra cost.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
16eb633bde
commit
1654d78d59
1 changed files with 37 additions and 0 deletions
|
|
@ -92,6 +92,34 @@ def _hash_raw(payload: dict) -> bytes:
|
||||||
return hashlib.sha256(_canonical_json(payload)).digest()
|
return hashlib.sha256(_canonical_json(payload)).digest()
|
||||||
|
|
||||||
|
|
||||||
|
def _has_nul(obj) -> bool:
|
||||||
|
"""Szybki rekurencyjny check na NUL byte (U+0000) w stringach. Short-circuituje,
|
||||||
|
więc dla czystych rekordów (99.99%) zero narzutu poza tanim spacerem."""
|
||||||
|
if isinstance(obj, str):
|
||||||
|
return "\x00" in obj
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return any(_has_nul(k) or _has_nul(v) for k, v in obj.items())
|
||||||
|
if isinstance(obj, list):
|
||||||
|
return any(_has_nul(v) for v in obj)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_nul(obj):
|
||||||
|
"""Rekurencyjnie usuwa NUL byte (U+0000) ze stringów (wartości i klucze dict).
|
||||||
|
|
||||||
|
Postgres JSONB/text NIE umie przechowywać `\\u0000` (DataError UntranslatableCharacter,
|
||||||
|
GOON-Z 2026-06-11: TPDB podał alias performera "Ramon\\u0000..."). Strip przed hash +
|
||||||
|
insert ORAZ przed normalize (aliasy → kolumny TEXT), żeby cała scena/film weszły do
|
||||||
|
ingestu zamiast się wywalić."""
|
||||||
|
if isinstance(obj, str):
|
||||||
|
return obj.replace("\x00", "") if "\x00" in obj else obj
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return {_strip_nul(k): _strip_nul(v) for k, v in obj.items()}
|
||||||
|
if isinstance(obj, list):
|
||||||
|
return [_strip_nul(v) for v in obj]
|
||||||
|
return obj
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_source(
|
def get_or_create_source(
|
||||||
session: Session, *, kind: SourceKind, name: str, base_url: str | None = None
|
session: Session, *, kind: SourceKind, name: str, base_url: str | None = None
|
||||||
) -> Source:
|
) -> Source:
|
||||||
|
|
@ -241,6 +269,12 @@ def ingest_from_connector(
|
||||||
|
|
||||||
def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None:
|
def _process_scene(*, source_id: uuid.UUID, raw_scene: RawScene, counters: dict[str, int]) -> None:
|
||||||
payload = raw_scene.raw or raw_scene.model_dump(mode="json")
|
payload = raw_scene.raw or raw_scene.model_dump(mode="json")
|
||||||
|
if _has_nul(payload):
|
||||||
|
# Strip NUL z payloadu (→ external_records.raw JSONB) ORAZ ze structured fields
|
||||||
|
# (→ normalize_scene → kolumny TEXT performera/aliasów). Inaczej insert pada na
|
||||||
|
# UntranslatableCharacter i scena nigdy nie wchodzi (GOON-Z).
|
||||||
|
payload = _strip_nul(payload)
|
||||||
|
raw_scene = raw_scene.model_validate(_strip_nul(raw_scene.model_dump()))
|
||||||
raw_hash = _hash_raw(payload)
|
raw_hash = _hash_raw(payload)
|
||||||
now = datetime.now(UTC)
|
now = datetime.now(UTC)
|
||||||
|
|
||||||
|
|
@ -363,6 +397,9 @@ def ingest_movies_from_connector(
|
||||||
|
|
||||||
def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None:
|
def _process_movie(*, source_id: uuid.UUID, raw_movie: RawMovie, counters: dict[str, int]) -> None:
|
||||||
payload = raw_movie.raw or raw_movie.model_dump(mode="json")
|
payload = raw_movie.raw or raw_movie.model_dump(mode="json")
|
||||||
|
if _has_nul(payload):
|
||||||
|
payload = _strip_nul(payload)
|
||||||
|
raw_movie = raw_movie.model_validate(_strip_nul(raw_movie.model_dump()))
|
||||||
raw_hash = _hash_raw(payload)
|
raw_hash = _hash_raw(payload)
|
||||||
now = datetime.now(UTC)
|
now = datetime.now(UTC)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue