Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion sdk/python/feast/type_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def _validate_collection_item_types(
"""
if sample is None:
return
if all(type(item) in valid_types for item in sample):
if all(type(item) in valid_types for item in sample if item is not None):
return

# to_numpy() upcasts INT32/INT64 with NULL to Float64 automatically
Expand All @@ -750,6 +750,8 @@ def _validate_collection_item_types(
ValueType.INT64_SET,
]
for item in sample:
if item is None:
continue # None elements in STRING_LIST are replaced with ""; for other types they are dropped
if type(item) not in valid_types:
if feast_value_type in int_collection_types:
# Check if the float values are due to NULL upcast
Expand Down Expand Up @@ -868,6 +870,39 @@ def convert_set_to_list(value: Any) -> Any:
]


# Per-type default values substituted for None elements inside list columns.
# Protobuf repeated fields do not accept None, so we replace with a
# type-appropriate zero/empty value.
_LIST_NONE_DEFAULTS: Dict[ValueType, Any] = {
ValueType.STRING_LIST: "",
ValueType.BYTES_LIST: b"",
ValueType.INT32_LIST: 0,
ValueType.INT64_LIST: 0,
ValueType.FLOAT_LIST: 0.0,
ValueType.DOUBLE_LIST: 0.0,
ValueType.BOOL_LIST: False,
ValueType.UNIX_TIMESTAMP_LIST: NULL_TIMESTAMP_INT_VALUE,
ValueType.UUID_LIST: "",
ValueType.TIME_UUID_LIST: "",
ValueType.DECIMAL_LIST: "",
}


def _sanitize_list_value(value: Any, feast_value_type: ValueType) -> Any:
"""Convert ndarray to list and replace None elements with a type-appropriate default.

Arrow/Athena may deserialize array columns as numpy.ndarray with object dtype
instead of plain Python lists. Protobuf repeated fields do not accept ndarrays
or None elements, so we normalise here before building proto messages.
"""
if isinstance(value, np.ndarray):
value = value.tolist()
none_default = _LIST_NONE_DEFAULTS.get(feast_value_type)
if none_default is not None and isinstance(value, list):
value = [none_default if v is None else v for v in value]
return value


def _convert_list_values_to_proto(
feast_value_type: ValueType,
values: List[Any],
Expand All @@ -890,6 +925,13 @@ def _convert_list_values_to_proto(
feast_value_type
]

values = [
_sanitize_list_value(v, feast_value_type) if v is not None else v
for v in values
]
if sample is not None:
sample = _sanitize_list_value(sample, feast_value_type)

# Bytes to array type conversion
if isinstance(sample, (bytes, bytearray)):
if feast_value_type == ValueType.BYTES_LIST:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour
)

store.apply([driver_entity, driver_stats])
# Refresh registry after apply to ensure subsequent reads see the new feature view
store.refresh_registry()
df = store.get_historical_features(
entity_df=entity_df,
features=[
Expand Down
164 changes: 164 additions & 0 deletions sdk/python/tests/unit/test_type_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def test_python_values_to_proto_values_bool(values):
(np.array([None]), ValueType.BYTES_LIST, None),
(np.array([None]), ValueType.STRING_LIST, None),
(np.array([None]), ValueType.UNIX_TIMESTAMP_LIST, None),
([np.array([], dtype=np.int32)], ValueType.INT32_LIST, []),
([np.array([], dtype=np.float32)], ValueType.FLOAT_LIST, []),
([np.array([], dtype=np.bool_)], ValueType.BOOL_LIST, []),
([b"[1,2,3]"], ValueType.INT64_LIST, [1, 2, 3]),
([b"[1,2,3]"], ValueType.INT32_LIST, [1, 2, 3]),
([b"[1.5,2.5,3.5]"], ValueType.FLOAT_LIST, [1.5, 2.5, 3.5]),
Expand Down Expand Up @@ -2065,3 +2068,164 @@ def test_proto_field_name_in_map(self):
from feast.type_map import PROTO_VALUE_TO_VALUE_TYPE_MAP

assert PROTO_VALUE_TO_VALUE_TYPE_MAP["scalar_map_val"] == ValueType.SCALAR_MAP


class TestArrowArrayStringListMaterialization:
"""Regression tests for Array(String) columns from Arrow/Athena materialization.

Arrow/Athena deserializes Array(String) feature columns as numpy.ndarray with
object dtype. Two bugs were triggered:

1. ValueError: "The truth value of an empty array is ambiguous"
— when an empty ndarray reached the scalar null-check `elif not pd.isnull(value)`.

2. TypeError: "bad argument type for built-in operation"
— when proto_type(val=<ndarray>) was called; protobuf rejects ndarrays.

Both are fixed by _sanitize_list_value, which converts ndarrays to plain Python
lists and replaces None elements with a type-appropriate zero/empty default
(see _LIST_NONE_DEFAULTS).
"""

def test_sanitize_list_value_ndarray(self):
"""ndarray is converted to a plain Python list."""
from feast.type_map import _sanitize_list_value

arr = np.array(["foo", "bar"], dtype=object)
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
assert result == ["foo", "bar"]
assert isinstance(result, list)

def test_sanitize_list_value_empty_ndarray(self):
"""Empty ndarray is converted to an empty Python list."""
from feast.type_map import _sanitize_list_value

arr = np.array([], dtype=object)
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
assert result == []

def test_sanitize_list_value_ndarray_with_none(self):
"""None elements inside a STRING_LIST ndarray are replaced with empty string."""
from feast.type_map import _sanitize_list_value

arr = np.array(["foo", None, "baz"], dtype=object)
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
assert result == ["foo", "", "baz"]

def test_sanitize_list_value_plain_list(self):
"""Plain Python lists without None pass through unchanged."""
from feast.type_map import _sanitize_list_value

lst = ["foo", "bar"]
result = _sanitize_list_value(lst, ValueType.STRING_LIST)
assert result == ["foo", "bar"]

def test_sanitize_list_value_plain_list_with_none(self):
"""None elements in a STRING_LIST plain list are replaced with empty string."""
from feast.type_map import _sanitize_list_value

lst = ["foo", None]
result = _sanitize_list_value(lst, ValueType.STRING_LIST)
assert result == ["foo", ""]

def test_sanitize_list_value_numeric_none_replaced(self):
"""None elements in numeric lists are replaced with a type-appropriate default."""
from feast.type_map import _sanitize_list_value

assert _sanitize_list_value([1, None, 2], ValueType.INT32_LIST) == [1, 0, 2]
assert _sanitize_list_value([1, None, 2], ValueType.INT64_LIST) == [1, 0, 2]
assert _sanitize_list_value([1.0, None, 2.0], ValueType.FLOAT_LIST) == [
1.0,
0.0,
2.0,
]
assert _sanitize_list_value([1.0, None, 2.0], ValueType.DOUBLE_LIST) == [
1.0,
0.0,
2.0,
]
assert _sanitize_list_value([True, None, False], ValueType.BOOL_LIST) == [
True,
False,
False,
]

def test_sanitize_list_value_bytes_none_replaced(self):
"""None elements in BYTES_LIST are replaced with b''."""
from feast.type_map import _sanitize_list_value

result = _sanitize_list_value([b"x", None], ValueType.BYTES_LIST)
assert result == [b"x", b""]

def test_sanitize_list_value_scalar_passthrough(self):
"""Non-list, non-ndarray values are returned unchanged."""
from feast.type_map import _sanitize_list_value

assert _sanitize_list_value("hello", ValueType.STRING_LIST) == "hello"
assert _sanitize_list_value(42, ValueType.INT32_LIST) == 42

def test_string_list_from_ndarray(self):
"""STRING_LIST column with ndarray values materializes without TypeError."""
values = [
np.array(["foo", "bar"], dtype=object),
np.array(["baz"], dtype=object),
]
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
assert len(protos) == 2
assert list(protos[0].string_list_val.val) == ["foo", "bar"]
assert list(protos[1].string_list_val.val) == ["baz"]

def test_string_list_from_empty_ndarray(self):
"""Empty ndarray in a STRING_LIST column must not raise ValueError."""
values = [
np.array([], dtype=object),
np.array(["foo"], dtype=object),
]
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
assert list(protos[0].string_list_val.val) == []
assert list(protos[1].string_list_val.val) == ["foo"]

def test_string_list_from_ndarray_with_none_elements(self):
"""None elements inside an ndarray must not cause TypeError in protobuf."""
values = [
np.array(["foo", None, "baz"], dtype=object),
]
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
# None is replaced with empty string
assert list(protos[0].string_list_val.val) == ["foo", "", "baz"]

def test_string_list_null_row_produces_empty_proto(self):
"""A None row (missing user) produces an empty ProtoValue."""
from feast.protos.feast.types.Value_pb2 import Value as ProtoValue

values = [
None,
np.array(["foo"], dtype=object),
]
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
assert protos[0] == ProtoValue()
assert list(protos[1].string_list_val.val) == ["foo"]

def test_mixed_batch_simulating_athena_chunk(self):
"""Simulate a real Athena chunk: mix of ndarray, empty ndarray, and None rows.

This is the exact scenario that triggered the TypeError during
string_list_features materialization.
"""
from feast.protos.feast.types.Value_pb2 import Value as ProtoValue

# tags / labels column from Athena
values = [
np.array(["foo", "bar"], dtype=object), # normal entity
np.array([], dtype=object), # entity with no values set
None, # missing entity (NULL row)
np.array(["baz"], dtype=object), # normal entity
np.array(["qux", None], dtype=object), # entity with partial null
]
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)

assert list(protos[0].string_list_val.val) == ["foo", "bar"]
assert list(protos[1].string_list_val.val) == []
assert protos[2] == ProtoValue()
assert list(protos[3].string_list_val.val) == ["baz"]
assert list(protos[4].string_list_val.val) == ["qux", ""]
Loading