Skip to content

Commit 4ed0278

Browse files
fix: Handle array of strings columns in Athena materialization (#6324)
* fix: handle string arrays in materialization Signed-off-by: Alan Gauthier <alan.gauthier@jobteaser.com> * pr feedback: none default values per type Signed-off-by: Alan Gauthier <alan.gauthier@jobteaser.com> * fix integration tests Signed-off-by: Alan Gauthier <alan.gauthier@jobteaser.com> --------- Signed-off-by: Alan Gauthier <alan.gauthier@jobteaser.com>
1 parent ae50414 commit 4ed0278

3 files changed

Lines changed: 209 additions & 1 deletion

File tree

sdk/python/feast/type_map.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,7 @@ def _validate_collection_item_types(
739739
"""
740740
if sample is None:
741741
return
742-
if all(type(item) in valid_types for item in sample):
742+
if all(type(item) in valid_types for item in sample if item is not None):
743743
return
744744

745745
# to_numpy() upcasts INT32/INT64 with NULL to Float64 automatically
@@ -750,6 +750,8 @@ def _validate_collection_item_types(
750750
ValueType.INT64_SET,
751751
]
752752
for item in sample:
753+
if item is None:
754+
continue # None elements in STRING_LIST are replaced with ""; for other types they are dropped
753755
if type(item) not in valid_types:
754756
if feast_value_type in int_collection_types:
755757
# Check if the float values are due to NULL upcast
@@ -868,6 +870,39 @@ def convert_set_to_list(value: Any) -> Any:
868870
]
869871

870872

873+
# Per-type default values substituted for None elements inside list columns.
874+
# Protobuf repeated fields do not accept None, so we replace with a
875+
# type-appropriate zero/empty value.
876+
_LIST_NONE_DEFAULTS: Dict[ValueType, Any] = {
877+
ValueType.STRING_LIST: "",
878+
ValueType.BYTES_LIST: b"",
879+
ValueType.INT32_LIST: 0,
880+
ValueType.INT64_LIST: 0,
881+
ValueType.FLOAT_LIST: 0.0,
882+
ValueType.DOUBLE_LIST: 0.0,
883+
ValueType.BOOL_LIST: False,
884+
ValueType.UNIX_TIMESTAMP_LIST: NULL_TIMESTAMP_INT_VALUE,
885+
ValueType.UUID_LIST: "",
886+
ValueType.TIME_UUID_LIST: "",
887+
ValueType.DECIMAL_LIST: "",
888+
}
889+
890+
891+
def _sanitize_list_value(value: Any, feast_value_type: ValueType) -> Any:
892+
"""Convert ndarray to list and replace None elements with a type-appropriate default.
893+
894+
Arrow/Athena may deserialize array columns as numpy.ndarray with object dtype
895+
instead of plain Python lists. Protobuf repeated fields do not accept ndarrays
896+
or None elements, so we normalise here before building proto messages.
897+
"""
898+
if isinstance(value, np.ndarray):
899+
value = value.tolist()
900+
none_default = _LIST_NONE_DEFAULTS.get(feast_value_type)
901+
if none_default is not None and isinstance(value, list):
902+
value = [none_default if v is None else v for v in value]
903+
return value
904+
905+
871906
def _convert_list_values_to_proto(
872907
feast_value_type: ValueType,
873908
values: List[Any],
@@ -890,6 +925,13 @@ def _convert_list_values_to_proto(
890925
feast_value_type
891926
]
892927

928+
values = [
929+
_sanitize_list_value(v, feast_value_type) if v is not None else v
930+
for v in values
931+
]
932+
if sample is not None:
933+
sample = _sanitize_list_value(sample, feast_value_type)
934+
893935
# Bytes to array type conversion
894936
if isinstance(sample, (bytes, bytearray)):
895937
if feast_value_type == ValueType.BYTES_LIST:

sdk/python/tests/integration/offline_store/test_offline_write.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour
135135
)
136136

137137
store.apply([driver_entity, driver_stats])
138+
# Refresh registry after apply to ensure subsequent reads see the new feature view
139+
store.refresh_registry()
138140
df = store.get_historical_features(
139141
entity_df=entity_df,
140142
features=[

sdk/python/tests/unit/test_type_map.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ def test_python_values_to_proto_values_bool(values):
8787
(np.array([None]), ValueType.BYTES_LIST, None),
8888
(np.array([None]), ValueType.STRING_LIST, None),
8989
(np.array([None]), ValueType.UNIX_TIMESTAMP_LIST, None),
90+
([np.array([], dtype=np.int32)], ValueType.INT32_LIST, []),
91+
([np.array([], dtype=np.float32)], ValueType.FLOAT_LIST, []),
92+
([np.array([], dtype=np.bool_)], ValueType.BOOL_LIST, []),
9093
([b"[1,2,3]"], ValueType.INT64_LIST, [1, 2, 3]),
9194
([b"[1,2,3]"], ValueType.INT32_LIST, [1, 2, 3]),
9295
([b"[1.5,2.5,3.5]"], ValueType.FLOAT_LIST, [1.5, 2.5, 3.5]),
@@ -2065,3 +2068,164 @@ def test_proto_field_name_in_map(self):
20652068
from feast.type_map import PROTO_VALUE_TO_VALUE_TYPE_MAP
20662069

20672070
assert PROTO_VALUE_TO_VALUE_TYPE_MAP["scalar_map_val"] == ValueType.SCALAR_MAP
2071+
2072+
2073+
class TestArrowArrayStringListMaterialization:
2074+
"""Regression tests for Array(String) columns from Arrow/Athena materialization.
2075+
2076+
Arrow/Athena deserializes Array(String) feature columns as numpy.ndarray with
2077+
object dtype. Two bugs were triggered:
2078+
2079+
1. ValueError: "The truth value of an empty array is ambiguous"
2080+
— when an empty ndarray reached the scalar null-check `elif not pd.isnull(value)`.
2081+
2082+
2. TypeError: "bad argument type for built-in operation"
2083+
— when proto_type(val=<ndarray>) was called; protobuf rejects ndarrays.
2084+
2085+
Both are fixed by _sanitize_list_value, which converts ndarrays to plain Python
2086+
lists and replaces None elements with a type-appropriate zero/empty default
2087+
(see _LIST_NONE_DEFAULTS).
2088+
"""
2089+
2090+
def test_sanitize_list_value_ndarray(self):
2091+
"""ndarray is converted to a plain Python list."""
2092+
from feast.type_map import _sanitize_list_value
2093+
2094+
arr = np.array(["foo", "bar"], dtype=object)
2095+
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
2096+
assert result == ["foo", "bar"]
2097+
assert isinstance(result, list)
2098+
2099+
def test_sanitize_list_value_empty_ndarray(self):
2100+
"""Empty ndarray is converted to an empty Python list."""
2101+
from feast.type_map import _sanitize_list_value
2102+
2103+
arr = np.array([], dtype=object)
2104+
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
2105+
assert result == []
2106+
2107+
def test_sanitize_list_value_ndarray_with_none(self):
2108+
"""None elements inside a STRING_LIST ndarray are replaced with empty string."""
2109+
from feast.type_map import _sanitize_list_value
2110+
2111+
arr = np.array(["foo", None, "baz"], dtype=object)
2112+
result = _sanitize_list_value(arr, ValueType.STRING_LIST)
2113+
assert result == ["foo", "", "baz"]
2114+
2115+
def test_sanitize_list_value_plain_list(self):
2116+
"""Plain Python lists without None pass through unchanged."""
2117+
from feast.type_map import _sanitize_list_value
2118+
2119+
lst = ["foo", "bar"]
2120+
result = _sanitize_list_value(lst, ValueType.STRING_LIST)
2121+
assert result == ["foo", "bar"]
2122+
2123+
def test_sanitize_list_value_plain_list_with_none(self):
2124+
"""None elements in a STRING_LIST plain list are replaced with empty string."""
2125+
from feast.type_map import _sanitize_list_value
2126+
2127+
lst = ["foo", None]
2128+
result = _sanitize_list_value(lst, ValueType.STRING_LIST)
2129+
assert result == ["foo", ""]
2130+
2131+
def test_sanitize_list_value_numeric_none_replaced(self):
2132+
"""None elements in numeric lists are replaced with a type-appropriate default."""
2133+
from feast.type_map import _sanitize_list_value
2134+
2135+
assert _sanitize_list_value([1, None, 2], ValueType.INT32_LIST) == [1, 0, 2]
2136+
assert _sanitize_list_value([1, None, 2], ValueType.INT64_LIST) == [1, 0, 2]
2137+
assert _sanitize_list_value([1.0, None, 2.0], ValueType.FLOAT_LIST) == [
2138+
1.0,
2139+
0.0,
2140+
2.0,
2141+
]
2142+
assert _sanitize_list_value([1.0, None, 2.0], ValueType.DOUBLE_LIST) == [
2143+
1.0,
2144+
0.0,
2145+
2.0,
2146+
]
2147+
assert _sanitize_list_value([True, None, False], ValueType.BOOL_LIST) == [
2148+
True,
2149+
False,
2150+
False,
2151+
]
2152+
2153+
def test_sanitize_list_value_bytes_none_replaced(self):
2154+
"""None elements in BYTES_LIST are replaced with b''."""
2155+
from feast.type_map import _sanitize_list_value
2156+
2157+
result = _sanitize_list_value([b"x", None], ValueType.BYTES_LIST)
2158+
assert result == [b"x", b""]
2159+
2160+
def test_sanitize_list_value_scalar_passthrough(self):
2161+
"""Non-list, non-ndarray values are returned unchanged."""
2162+
from feast.type_map import _sanitize_list_value
2163+
2164+
assert _sanitize_list_value("hello", ValueType.STRING_LIST) == "hello"
2165+
assert _sanitize_list_value(42, ValueType.INT32_LIST) == 42
2166+
2167+
def test_string_list_from_ndarray(self):
2168+
"""STRING_LIST column with ndarray values materializes without TypeError."""
2169+
values = [
2170+
np.array(["foo", "bar"], dtype=object),
2171+
np.array(["baz"], dtype=object),
2172+
]
2173+
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
2174+
assert len(protos) == 2
2175+
assert list(protos[0].string_list_val.val) == ["foo", "bar"]
2176+
assert list(protos[1].string_list_val.val) == ["baz"]
2177+
2178+
def test_string_list_from_empty_ndarray(self):
2179+
"""Empty ndarray in a STRING_LIST column must not raise ValueError."""
2180+
values = [
2181+
np.array([], dtype=object),
2182+
np.array(["foo"], dtype=object),
2183+
]
2184+
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
2185+
assert list(protos[0].string_list_val.val) == []
2186+
assert list(protos[1].string_list_val.val) == ["foo"]
2187+
2188+
def test_string_list_from_ndarray_with_none_elements(self):
2189+
"""None elements inside an ndarray must not cause TypeError in protobuf."""
2190+
values = [
2191+
np.array(["foo", None, "baz"], dtype=object),
2192+
]
2193+
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
2194+
# None is replaced with empty string
2195+
assert list(protos[0].string_list_val.val) == ["foo", "", "baz"]
2196+
2197+
def test_string_list_null_row_produces_empty_proto(self):
2198+
"""A None row (missing user) produces an empty ProtoValue."""
2199+
from feast.protos.feast.types.Value_pb2 import Value as ProtoValue
2200+
2201+
values = [
2202+
None,
2203+
np.array(["foo"], dtype=object),
2204+
]
2205+
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
2206+
assert protos[0] == ProtoValue()
2207+
assert list(protos[1].string_list_val.val) == ["foo"]
2208+
2209+
def test_mixed_batch_simulating_athena_chunk(self):
2210+
"""Simulate a real Athena chunk: mix of ndarray, empty ndarray, and None rows.
2211+
2212+
This is the exact scenario that triggered the TypeError during
2213+
string_list_features materialization.
2214+
"""
2215+
from feast.protos.feast.types.Value_pb2 import Value as ProtoValue
2216+
2217+
# tags / labels column from Athena
2218+
values = [
2219+
np.array(["foo", "bar"], dtype=object), # normal entity
2220+
np.array([], dtype=object), # entity with no values set
2221+
None, # missing entity (NULL row)
2222+
np.array(["baz"], dtype=object), # normal entity
2223+
np.array(["qux", None], dtype=object), # entity with partial null
2224+
]
2225+
protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
2226+
2227+
assert list(protos[0].string_list_val.val) == ["foo", "bar"]
2228+
assert list(protos[1].string_list_val.val) == []
2229+
assert protos[2] == ProtoValue()
2230+
assert list(protos[3].string_list_val.val) == ["baz"]
2231+
assert list(protos[4].string_list_val.val) == ["qux", ""]

0 commit comments

Comments
 (0)