Skip to content

Commit de9bacb

Browse files
authored
[GH-1147] Fix null byte crash in semantic memory ingestion (#1165)
Fix null byte crash in PostgreSQL storage (issue #1147). Add a sanitize_pg_text utility to strip null bytes from text before writing to PostgreSQL. Apply the sanitization in the PG storage add_feature and update_feature methods, and add validators to SemanticCommand and LLMReducedFeature to strip null bytes at the model boundary. Includes regression tests for null byte handling. (Rebased onto main via path translation — packaging refactor applied.) Signed-off-by: Oscar Love <olabradalove@gmail.com>
1 parent d5b9fcf commit de9bacb

7 files changed

Lines changed: 206 additions & 7 deletions

File tree

packages/server/server_tests/memmachine_server/semantic_memory/storage/test_semantic_storage.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,6 +1257,24 @@ async def test_filter_features_by_created_at_range(
12571257
assert results[0].value == "note2"
12581258

12591259

1260+
@pytest.mark.asyncio
async def test_add_feature_with_null_bytes_does_not_crash(
    semantic_storage: SemanticStorage,
):
    """Regression test for issue #1147: null bytes in feature values must not crash.

    Adds a feature whose ``feature``, ``value`` and ``tag`` strings each
    contain an embedded null byte, then reads the feature back by the id
    returned from ``add_feature`` and checks that something was stored.
    """
    # Each free-text field carries one embedded 0x00 byte.
    dirty_text_fields = {
        "feature": "cultural\x00focus",
        "value": "D\x00a de Los Muertos",
        "tag": "Ins\x00ights",
    }
    embedding_vector = np.array([1.0] * 1536, dtype=float)

    new_feature_id = await semantic_storage.add_feature(
        set_id="user1",
        category_name="profile",
        embedding=embedding_vector,
        **dirty_text_fields,
    )

    fetched = await semantic_storage.get_feature(new_feature_id)
    assert fetched is not None
1276+
1277+
12601278
@pytest.mark.asyncio
12611279
async def test_filter_equality(semantic_storage: SemanticStorage):
12621280
feature_ids: list[FeatureIdT] = [
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Tests for the PostgreSQL text sanitizer utility."""
2+
3+
import logging
4+
5+
from memmachine_server.semantic_memory.storage.text_sanitizer import sanitize_pg_text
6+
7+
8+
class TestSanitizePgText:
    """Tests for sanitize_pg_text."""

    def test_clean_string_passes_through(self):
        """A string without null bytes is returned unchanged."""
        cleaned = sanitize_pg_text("hello world")
        assert cleaned == "hello world"

    def test_empty_string_passes_through(self):
        """The empty string is returned as the empty string."""
        cleaned = sanitize_pg_text("")
        assert cleaned == ""

    def test_unicode_string_passes_through(self):
        """Non-ASCII characters are left untouched."""
        original = "Día de Los Muertos"
        cleaned = sanitize_pg_text(original)
        assert cleaned == original

    def test_strips_single_null_byte(self):
        """One embedded null byte is removed."""
        cleaned = sanitize_pg_text("D\x00a de Los Muertos")
        assert cleaned == "Da de Los Muertos"

    def test_strips_multiple_null_bytes(self):
        """Every null byte is removed, not only the first one."""
        cleaned = sanitize_pg_text("a\x00b\x00c")
        assert cleaned == "abc"

    def test_strips_leading_null_byte(self):
        """A null byte at the start of the string is removed."""
        cleaned = sanitize_pg_text("\x00hello")
        assert cleaned == "hello"

    def test_strips_trailing_null_byte(self):
        """A null byte at the end of the string is removed."""
        cleaned = sanitize_pg_text("hello\x00")
        assert cleaned == "hello"

    def test_string_of_only_null_bytes(self):
        """A string made up solely of null bytes becomes empty."""
        cleaned = sanitize_pg_text("\x00\x00\x00")
        assert cleaned == ""

    def test_logs_warning_when_stripping(self, caplog):
        """Stripping emits exactly one record mentioning null bytes and the context."""
        with caplog.at_level(logging.WARNING):
            sanitize_pg_text("D\x00a de Los Muertos", context="feature.value")

        assert len(caplog.records) == 1
        logged_message = caplog.records[0].message
        assert "null byte" in logged_message.lower()
        assert "feature.value" in logged_message

    def test_no_log_for_clean_string(self, caplog):
        """No record is captured when there is nothing to strip."""
        with caplog.at_level(logging.WARNING):
            sanitize_pg_text("clean string")

        assert len(caplog.records) == 0

packages/server/server_tests/memmachine_server/semantic_memory/test_semantic_model.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,3 +312,96 @@ def test_consolidation_prompt_should_include_user_tags(self):
312312
f"Consolidation prompt should contain tag '{tag_name}' "
313313
f"so the LLM knows which tags are valid"
314314
)
315+
316+
317+
class TestSemanticCommandNullByteStripping:
    """Tests that SemanticCommand strips null bytes from LLM output."""

    @staticmethod
    def _add_command(*, feature: str, tag: str, value: str):
        """Build an ADD ``SemanticCommand`` from the given field values."""
        # Imported at call time rather than at module import.
        from memmachine_server.semantic_memory.semantic_model import (
            SemanticCommand,
            SemanticCommandType,
        )

        return SemanticCommand(
            command=SemanticCommandType.ADD,
            feature=feature,
            tag=tag,
            value=value,
        )

    def test_strips_null_byte_from_value(self):
        """A null byte inside ``value`` is removed on construction."""
        built = self._add_command(
            feature="cultural_focus",
            tag="Insights",
            value="D\x00a de Los Muertos",
        )
        assert "\x00" not in built.value
        assert built.value == "Da de Los Muertos"

    def test_strips_null_byte_from_feature(self):
        """A null byte inside ``feature`` is removed on construction."""
        built = self._add_command(
            feature="cultural\x00focus",
            tag="Insights",
            value="some value",
        )
        assert "\x00" not in built.feature
        assert built.feature == "culturalfocus"

    def test_strips_null_byte_from_tag(self):
        """A null byte inside ``tag`` is removed on construction."""
        built = self._add_command(
            feature="focus",
            tag="Ins\x00ights",
            value="some value",
        )
        assert "\x00" not in built.tag
        assert built.tag == "Insights"

    def test_clean_strings_pass_through(self):
        """Fields without null bytes, including non-ASCII text, are unchanged."""
        built = self._add_command(
            feature="cultural_focus",
            tag="Insights",
            value="Día de Los Muertos",
        )
        assert built.value == "Día de Los Muertos"
        assert built.feature == "cultural_focus"
        assert built.tag == "Insights"
380+
381+
382+
class TestLLMReducedFeatureNullByteStripping:
    """Tests that LLMReducedFeature strips null bytes from consolidation output."""

    @staticmethod
    def _reduced_feature(*, tag: str, feature: str, value: str):
        """Build an ``LLMReducedFeature`` from the given field values."""
        # Imported at call time rather than at module import.
        from memmachine_server.semantic_memory.semantic_llm import LLMReducedFeature

        return LLMReducedFeature(tag=tag, feature=feature, value=value)

    def test_strips_null_byte_from_value(self):
        """A null byte inside ``value`` is removed on construction."""
        reduced = self._reduced_feature(tag="tag", feature="feat", value="D\x00a")
        assert reduced.value == "Da"

    def test_strips_null_byte_from_feature(self):
        """A null byte inside ``feature`` is removed on construction."""
        reduced = self._reduced_feature(tag="tag", feature="fe\x00at", value="val")
        assert reduced.feature == "feat"

    def test_strips_null_byte_from_tag(self):
        """A null byte inside ``tag`` is removed on construction."""
        reduced = self._reduced_feature(tag="t\x00ag", feature="feat", value="val")
        assert reduced.tag == "tag"

    def test_clean_strings_pass_through(self):
        """A non-ASCII value without null bytes is unchanged."""
        reduced = self._reduced_feature(tag="tag", feature="feat", value="Día")
        assert reduced.value == "Día"

packages/server/src/memmachine_server/semantic_memory/semantic_llm.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
Field,
1010
InstanceOf,
1111
TypeAdapter,
12+
field_validator,
1213
validate_call,
1314
)
1415

@@ -107,6 +108,13 @@ class LLMReducedFeature(BaseModel):
107108
feature: str
108109
value: str
109110

111+
@field_validator("tag", "feature", "value", mode="after")
@classmethod
def strip_null_bytes(cls, v: str) -> str:
    """Return the field value with every null byte (``\\x00``) removed.

    Registered as an ``after`` validator for ``tag``, ``feature`` and
    ``value``. A value that contains no null byte is returned as-is.
    """
    return v.replace("\x00", "") if "\x00" in v else v
117+
110118

111119
class SemanticConsolidateMemoryRes(BaseModel):
112120
"""LLM response describing merged features and ids of features to retain."""

packages/server/src/memmachine_server/semantic_memory/semantic_model.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from enum import Enum
66
from typing import Any, Literal, Protocol, runtime_checkable
77

8-
from pydantic import BaseModel, InstanceOf
8+
from pydantic import BaseModel, InstanceOf, field_validator
99

1010
from memmachine_server.common.embedder import Embedder
1111
from memmachine_server.common.episode_store import EpisodeIdT
@@ -33,6 +33,13 @@ class SemanticCommand(BaseModel):
3333
tag: str
3434
value: str
3535

36+
@field_validator("feature", "tag", "value", mode="after")
@classmethod
def strip_null_bytes(cls, v: str) -> str:
    """Return the field value with every null byte (``\\x00``) removed.

    Registered as an ``after`` validator for ``feature``, ``tag`` and
    ``value``. A value that contains no null byte is returned as-is.
    """
    return v.replace("\x00", "") if "\x00" in v else v
42+
3643

3744
@dataclass
3845
class RawSemanticPrompt:

packages/server/src/memmachine_server/semantic_memory/storage/sqlalchemy_pgvector_semantic.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
FeatureIdT,
5454
SemanticStorage,
5555
)
56+
from memmachine_server.semantic_memory.storage.text_sanitizer import sanitize_pg_text
5657

5758
logger = logging.getLogger(__name__)
5859

@@ -235,9 +236,9 @@ async def add_feature(
235236
.values(
236237
set_id=set_id,
237238
semantic_category_id=category_name,
238-
tag_id=tag,
239-
feature=feature,
240-
value=value,
239+
tag_id=sanitize_pg_text(tag, context="feature.tag"),
240+
feature=sanitize_pg_text(feature, context="feature.feature"),
241+
value=sanitize_pg_text(value, context="feature.value"),
241242
embedding=embedding,
242243
json_metadata=metadata,
243244
)
@@ -275,11 +276,13 @@ async def update_feature(
275276
if category_name is not None:
276277
stmt = stmt.values(semantic_category_id=category_name)
277278
if feature is not None:
278-
stmt = stmt.values(feature=feature)
279+
stmt = stmt.values(
280+
feature=sanitize_pg_text(feature, context="feature.feature")
281+
)
279282
if value is not None:
280-
stmt = stmt.values(value=value)
283+
stmt = stmt.values(value=sanitize_pg_text(value, context="feature.value"))
281284
if tag is not None:
282-
stmt = stmt.values(tag_id=tag)
285+
stmt = stmt.values(tag_id=sanitize_pg_text(tag, context="feature.tag"))
283286
if embedding is not None:
284287
stmt = stmt.values(embedding=embedding)
285288
if metadata is not None:
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Sanitize text values before PostgreSQL insertion."""
2+
3+
import logging
4+
5+
logger = logging.getLogger(__name__)
6+
7+
8+
def sanitize_pg_text(value: str, *, context: str = "") -> str:
    """Remove null bytes from *value* so it can go into a PostgreSQL TEXT column.

    PostgreSQL rejects null bytes (0x00) in TEXT/VARCHAR columns. When
    any are present they are removed and a warning is logged, so the
    data-quality issue stays visible for monitoring.

    Args:
        value: The text to sanitize.
        context: Optional label naming where the text came from. When
            non-empty it is appended, in parentheses, to the log message.

    Returns:
        ``value`` unchanged if it contains no null byte, otherwise a copy
        with every null byte removed.
    """
    # Fast path: nothing to strip, so nothing to log either.
    if "\x00" not in value:
        return value

    context_suffix = f" ({context})" if context else ""
    # "%.200r" caps the logged repr of the offending text at 200 characters.
    logger.warning(
        "Stripped null byte(s) from text value%s: %.200r",
        context_suffix,
        value,
    )
    return value.replace("\x00", "")

0 commit comments

Comments
 (0)