Skip to content

Commit e558c81

Browse files
soooojinlee and claude committed
fix: Preserve inner element types in PyArrow schema inference and optimize JSON nested list detection
- Add _parse_pa_type_str() to reconstruct PyArrow types from type strings for VALUE_LIST/VALUE_SET, avoiding a lossy round-trip through a placeholder type
- Optimize proto_json nested list detection: only scan with any() when the first element is None, avoiding an O(n) scan for flat lists
- Add a warning log for unrecognized PyArrow type strings

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: soojin <soojin@dable.io>
1 parent 4c8502e commit e558c81

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

sdk/python/feast/infra/offline_stores/offline_utils.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import uuid
23
from dataclasses import asdict, dataclass
34
from datetime import datetime, timedelta, timezone
@@ -21,6 +22,7 @@
2122
from feast.repo_config import RepoConfig
2223
from feast.type_map import feast_value_type_to_pa
2324
from feast.utils import _get_requested_feature_views_to_features_dict, to_naive_utc
25+
from feast.value_type import ValueType
2426

2527
DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL = "event_timestamp"
2628

@@ -241,6 +243,37 @@ def get_offline_store_from_config(offline_store_config: Any) -> OfflineStore:
241243
return offline_store_class()
242244

243245

246+
# Lookup table from the string form of a primitive PyArrow type to the
# corresponding DataType instance; consulted by _parse_pa_type_str().
_PA_BASIC_TYPES = {
    type_name: make_type()
    for type_name, make_type in (
        ("int32", pa.int32),
        ("int64", pa.int64),
        ("double", pa.float64),
        ("float", pa.float32),
        ("string", pa.string),
        ("binary", pa.binary),
        ("bool", pa.bool_),
        ("large_string", pa.large_string),
        ("null", pa.null),
    )
}
257+
258+
259+
def _parse_pa_type_str(pa_type_str: str, timestamp_unit: str = "us") -> pa.DataType:
    """Rebuild a PyArrow data type from its string form, keeping nested element types.

    Args:
        pa_type_str: Type string such as ``"int64"`` or
            ``"list<item: list<item: double>>"``. Surrounding whitespace is ignored.
        timestamp_unit: Unit applied to every ``timestamp...`` type encountered,
            including ones nested inside lists. The unit and timezone written in
            the string itself are not parsed.

    Returns:
        The reconstructed type. List types are rebuilt recursively with
        ``pa.list_`` (the child field name from the string is not carried over).
        Strings that are not recognized fall back to ``pa.string()`` after a
        warning is logged.
    """
    not_null_suffix = " not null"
    type_str = pa_type_str.strip()

    # A non-nullable child field is rendered as "<type> not null". Nullability is
    # a property of the field, not of the data type, so drop the marker up front
    # instead of letting it push an otherwise known type into the fallback.
    if type_str.endswith(not_null_suffix):
        type_str = type_str[: -len(not_null_suffix)].rstrip()

    if type_str.startswith("list<") and type_str.endswith(">"):
        # The child is rendered as "<field name>: <type>". Accept any field name
        # (not only "item") so lists written with a different child name still
        # keep their element type.
        field_name, separator, inner = type_str[len("list<") : -1].partition(": ")
        if separator and "<" not in field_name and "," not in field_name:
            return pa.list_(_parse_pa_type_str(inner, timestamp_unit))

    if type_str in _PA_BASIC_TYPES:
        return _PA_BASIC_TYPES[type_str]

    if type_str.startswith("timestamp"):
        return pa.timestamp(timestamp_unit)

    # Best-effort fallback: keep schema inference going, but make the loss visible.
    logging.getLogger(__name__).warning(
        "Unrecognized PyArrow type string '%s', falling back to pa.string()",
        type_str,
    )
    return pa.string()
275+
276+
244277
def get_pyarrow_schema_from_batch_source(
245278
config: RepoConfig, batch_source: DataSource, timestamp_unit: str = "us"
246279
) -> Tuple[pa.Schema, List[str]]:
@@ -250,15 +283,12 @@ def get_pyarrow_schema_from_batch_source(
250283
pa_schema = []
251284
column_names = []
252285
for column_name, column_type in column_names_and_types:
253-
pa_schema.append(
254-
(
255-
column_name,
256-
feast_value_type_to_pa(
257-
batch_source.source_datatype_to_feast_value_type()(column_type),
258-
timestamp_unit=timestamp_unit,
259-
),
260-
)
261-
)
286+
value_type = batch_source.source_datatype_to_feast_value_type()(column_type)
287+
if value_type in (ValueType.VALUE_LIST, ValueType.VALUE_SET):
288+
pa_type = _parse_pa_type_str(column_type)
289+
else:
290+
pa_type = feast_value_type_to_pa(value_type, timestamp_unit=timestamp_unit)
291+
pa_schema.append((column_name, pa_type))
262292
column_names.append(column_name)
263293

264294
return pa.schema(pa_schema), column_names

sdk/python/feast/proto_json.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,9 @@ def from_json_object(
9292
if len(value) == 0:
9393
# Clear will mark the struct as modified so it will be created even if there are no values
9494
message.int64_list_val.Clear()
95-
elif isinstance(value[0], list) or any(isinstance(v, list) for v in value):
95+
elif isinstance(value[0], list) or (
96+
value[0] is None and any(isinstance(v, list) for v in value)
97+
):
9698
# Nested collection (list of lists).
9799
# Fall back to any() only when the first element is None
98100
# (empty inner collections round-trip through proto as None).

0 commit comments

Comments
 (0)