diff --git a/docs/reference/type-system.md b/docs/reference/type-system.md index 3bda3ad16c5..9f36f6eeaff 100644 --- a/docs/reference/type-system.md +++ b/docs/reference/type-system.md @@ -3,7 +3,7 @@ ## Motivation Feast uses an internal type system to provide guarantees on training and serving data. -Feast supports primitive types, array types, and map types for feature values. +Feast supports primitive types, array types, set types, and map types for feature values. Null types are not supported, although the `UNIX_TIMESTAMP` type is nullable. The type system is controlled by [`Value.proto`](https://github.com/feast-dev/feast/blob/master/protos/feast/types/Value.proto) in protobuf and by [`types.py`](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/types.py) in Python. Type conversion logic can be found in [`type_map.py`](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/type_map.py). @@ -40,6 +40,23 @@ All primitive types have corresponding array (list) types: | `Array(Bool)` | `List[bool]` | List of booleans | | `Array(UnixTimestamp)` | `List[datetime]` | List of timestamps | +### Set Types + +All primitive types (except Map) have corresponding set types for storing unique values: + +| Feast Type | Python Type | Description | +|------------|-------------|-------------| +| `Set(Int32)` | `Set[int]` | Set of unique 32-bit integers | +| `Set(Int64)` | `Set[int]` | Set of unique 64-bit integers | +| `Set(Float32)` | `Set[float]` | Set of unique 32-bit floats | +| `Set(Float64)` | `Set[float]` | Set of unique 64-bit floats | +| `Set(String)` | `Set[str]` | Set of unique strings | +| `Set(Bytes)` | `Set[bytes]` | Set of unique binary data | +| `Set(Bool)` | `Set[bool]` | Set of unique booleans | +| `Set(UnixTimestamp)` | `Set[datetime]` | Set of unique timestamps | + +**Note:** Set types automatically remove duplicate values. When converting from lists or other iterables to sets, duplicates are eliminated. + ### Map Types Map types allow storing dictionary-like data structures: @@ -60,7 +77,7 @@ from datetime import timedelta from feast import Entity, FeatureView, Field, FileSource from feast.types import ( Int32, Int64, Float32, Float64, String, Bytes, Bool, UnixTimestamp, - Array, Map + Array, Set, Map ) # Define a data source @@ -101,6 +118,12 @@ user_features = FeatureView( Field(name="notification_settings", dtype=Array(Bool)), Field(name="login_timestamps", dtype=Array(UnixTimestamp)), + # Set types (unique values only) + Field(name="visited_pages", dtype=Set(String)), + Field(name="unique_categories", dtype=Set(Int32)), + Field(name="tag_ids", dtype=Set(Int64)), + Field(name="preferred_languages", dtype=Set(String)), + # Map types Field(name="user_preferences", dtype=Map), Field(name="metadata", dtype=Map), @@ -110,6 +133,24 @@ user_features = FeatureView( ) ``` +### Set Type Usage Examples + +Sets store unique values and automatically remove duplicates: + +```python +# Simple set +visited_pages = {"home", "products", "checkout", "products"} # "products" appears twice +# Feast will store this as: {"home", "products", "checkout"} + +# Integer set +unique_categories = {1, 2, 3, 2, 1} # duplicates will be removed +# Feast will store this as: {1, 2, 3} + +# Converting a list with duplicates to a set +tag_list = [100, 200, 300, 100, 200] +tag_ids = set(tag_list) # {100, 200, 300} +``` + ### Map Type Usage Examples Maps can store complex nested data structures: diff --git a/protos/feast/types/Value.proto b/protos/feast/types/Value.proto index 703684c3b52..be93235ab36 100644 --- a/protos/feast/types/Value.proto +++ b/protos/feast/types/Value.proto @@ -45,6 +45,14 @@ message ValueType { NULL = 19; MAP = 20; MAP_LIST = 21; + BYTES_SET = 22; + STRING_SET = 23; + INT32_SET = 24; + INT64_SET = 25; + DOUBLE_SET = 26; + FLOAT_SET = 27; + BOOL_SET = 28; + UNIX_TIMESTAMP_SET = 29; } } @@ -72,6 +80,14 @@ message Value { Null null_val = 19; Map map_val = 20; MapList map_list_val = 21; + BytesSet bytes_set_val = 22; + StringSet string_set_val = 23; + Int32Set int32_set_val = 24; + Int64Set int64_set_val = 25; + DoubleSet double_set_val = 26; + FloatSet float_set_val = 27; + BoolSet bool_set_val = 28; + Int64Set unix_timestamp_set_val = 29; } } @@ -107,6 +123,34 @@ message BoolList { repeated bool val = 1; } +message BytesSet { + repeated bytes val = 1; +} + +message StringSet { + repeated string val = 1; +} + +message Int32Set { + repeated int32 val = 1; +} + +message Int64Set { + repeated int64 val = 1; +} + +message DoubleSet { + repeated double val = 1; +} + +message FloatSet { + repeated float val = 1; +} + +message BoolSet { + repeated bool val = 1; +} + message Map { map val = 1; } diff --git a/sdk/python/feast/infra/online_stores/milvus_online_store/milvus.py b/sdk/python/feast/infra/online_stores/milvus_online_store/milvus.py index 9a2ecb935bc..42a8f359107 100644 --- a/sdk/python/feast/infra/online_stores/milvus_online_store/milvus.py +++ b/sdk/python/feast/infra/online_stores/milvus_online_store/milvus.py @@ -167,6 +167,8 @@ def _get_or_create_collection( fields_to_exclude = [ "event_ts", "created_ts", + "event_timestamp", + "created_timestamp", ] fields_to_add = [f for f in table.schema if f.name not in fields_to_exclude] for field in fields_to_add: @@ -202,6 +204,7 @@ def _get_or_create_collection( schema=schema, ) index_params = self.client.prepare_index_params() + indices_added = False for vector_field in schema.fields: if ( vector_field.dtype @@ -222,7 +225,8 @@ def _get_or_create_collection( index_name=f"vector_index_{vector_field.name}", params={"nlist": config.online_store.nlist}, ) - if len(index_params) > 0: + indices_added = True + if indices_added: self.client.create_index( collection_name=collection_name, index_params=index_params, @@ -281,6 +285,16 @@ def online_write_batch( serialize_to_string=True, ) + # Remove timestamp fields that are handled separately to avoid conflicts + timestamp_fields = [ + "event_timestamp", + "created_timestamp", + "event_ts", + "created_ts", + ] + for field in timestamp_fields: + values_dict.pop(field, None) + single_entity_record = { composite_key_name: entity_key_str, "event_ts": timestamp_int, @@ -722,7 +736,7 @@ def _extract_proto_values_to_dict( numeric_vector_list_types = [ k for k in PROTO_VALUE_TO_VALUE_TYPE_MAP.keys() - if k is not None and "list" in k and "string" not in k + if k is not None and ("list" in k or "set" in k) and "string" not in k ] numeric_types = [ "double_val", @@ -747,9 +761,13 @@ def _extract_proto_values_to_dict( if ( serialize_to_string and proto_val_type - not in ["string_val", "bytes_val"] + numeric_types + not in ["string_val", "bytes_val", "unix_timestamp_val"] + + numeric_types ): - vector_values = feature_values.SerializeToString().decode() + # For complex types, use base64 encoding instead of decode + vector_values = base64.b64encode( + feature_values.SerializeToString() + ).decode("utf-8") elif proto_val_type == "bytes_val": byte_data = getattr(feature_values, proto_val_type) vector_values = base64.b64encode(byte_data).decode("utf-8") diff --git a/sdk/python/feast/protos/feast/core/DatastoreTable_pb2.pyi b/sdk/python/feast/protos/feast/core/DatastoreTable_pb2.pyi index 6339a97536e..7b5a629eb7a 100644 --- a/sdk/python/feast/protos/feast/core/DatastoreTable_pb2.pyi +++ b/sdk/python/feast/protos/feast/core/DatastoreTable_pb2.pyi @@ -1,19 +1,19 @@ """ @generated by mypy-protobuf. Do not edit manually! isort:skip_file - -* Copyright 2021 The Feast Authors -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and + +* Copyright 2021 The Feast Authors +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and * limitations under the License. """ import builtins diff --git a/sdk/python/feast/protos/feast/core/Entity_pb2.pyi b/sdk/python/feast/protos/feast/core/Entity_pb2.pyi index 025817edfee..a5924a13451 100644 --- a/sdk/python/feast/protos/feast/core/Entity_pb2.pyi +++ b/sdk/python/feast/protos/feast/core/Entity_pb2.pyi @@ -1,19 +1,19 @@ """ @generated by mypy-protobuf. Do not edit manually! isort:skip_file - -* Copyright 2020 The Feast Authors -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and + +* Copyright 2020 The Feast Authors +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and * limitations under the License. """ import builtins diff --git a/sdk/python/feast/protos/feast/core/FeatureViewProjection_pb2.pyi b/sdk/python/feast/protos/feast/core/FeatureViewProjection_pb2.pyi index 6b44ad4a931..72426f55c9f 100644 --- a/sdk/python/feast/protos/feast/core/FeatureViewProjection_pb2.pyi +++ b/sdk/python/feast/protos/feast/core/FeatureViewProjection_pb2.pyi @@ -19,7 +19,7 @@ else: DESCRIPTOR: google.protobuf.descriptor.FileDescriptor class FeatureViewProjection(google.protobuf.message.Message): - """A projection to be applied on top of a FeatureView. + """A projection to be applied on top of a FeatureView. Contains the modifications to a FeatureView such as the features subset to use. """ diff --git a/sdk/python/feast/protos/feast/core/Project_pb2.pyi b/sdk/python/feast/protos/feast/core/Project_pb2.pyi index e3cce2ec425..3196304a19b 100644 --- a/sdk/python/feast/protos/feast/core/Project_pb2.pyi +++ b/sdk/python/feast/protos/feast/core/Project_pb2.pyi @@ -1,19 +1,19 @@ """ @generated by mypy-protobuf. Do not edit manually! isort:skip_file - -* Copyright 2020 The Feast Authors -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and + +* Copyright 2020 The Feast Authors +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and * limitations under the License. """ import builtins diff --git a/sdk/python/feast/protos/feast/core/Registry_pb2.pyi b/sdk/python/feast/protos/feast/core/Registry_pb2.pyi index fca49c75481..ad09878b77f 100644 --- a/sdk/python/feast/protos/feast/core/Registry_pb2.pyi +++ b/sdk/python/feast/protos/feast/core/Registry_pb2.pyi @@ -1,19 +1,19 @@ """ @generated by mypy-protobuf. Do not edit manually! isort:skip_file - -* Copyright 2020 The Feast Authors -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and + +* Copyright 2020 The Feast Authors +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and * limitations under the License. """ import builtins diff --git a/sdk/python/feast/protos/feast/types/Value_pb2.py b/sdk/python/feast/protos/feast/types/Value_pb2.py index 87134112a66..2ab1d2cc8fb 100644 --- a/sdk/python/feast/protos/feast/types/Value_pb2.py +++ b/sdk/python/feast/protos/feast/types/Value_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17\x66\x65\x61st/types/Value.proto\x12\x0b\x66\x65\x61st.types\"\xae\x02\n\tValueType\"\xa0\x02\n\x04\x45num\x12\x0b\n\x07INVALID\x10\x00\x12\t\n\x05\x42YTES\x10\x01\x12\n\n\x06STRING\x10\x02\x12\t\n\x05INT32\x10\x03\x12\t\n\x05INT64\x10\x04\x12\n\n\x06\x44OUBLE\x10\x05\x12\t\n\x05\x46LOAT\x10\x06\x12\x08\n\x04\x42OOL\x10\x07\x12\x12\n\x0eUNIX_TIMESTAMP\x10\x08\x12\x0e\n\nBYTES_LIST\x10\x0b\x12\x0f\n\x0bSTRING_LIST\x10\x0c\x12\x0e\n\nINT32_LIST\x10\r\x12\x0e\n\nINT64_LIST\x10\x0e\x12\x0f\n\x0b\x44OUBLE_LIST\x10\x0f\x12\x0e\n\nFLOAT_LIST\x10\x10\x12\r\n\tBOOL_LIST\x10\x11\x12\x17\n\x13UNIX_TIMESTAMP_LIST\x10\x12\x12\x08\n\x04NULL\x10\x13\x12\x07\n\x03MAP\x10\x14\x12\x0c\n\x08MAP_LIST\x10\x15\"\xd5\x05\n\x05Value\x12\x13\n\tbytes_val\x18\x01 \x01(\x0cH\x00\x12\x14\n\nstring_val\x18\x02 \x01(\tH\x00\x12\x13\n\tint32_val\x18\x03 \x01(\x05H\x00\x12\x13\n\tint64_val\x18\x04 \x01(\x03H\x00\x12\x14\n\ndouble_val\x18\x05 \x01(\x01H\x00\x12\x13\n\tfloat_val\x18\x06 \x01(\x02H\x00\x12\x12\n\x08\x62ool_val\x18\x07 \x01(\x08H\x00\x12\x1c\n\x12unix_timestamp_val\x18\x08 \x01(\x03H\x00\x12\x30\n\x0e\x62ytes_list_val\x18\x0b \x01(\x0b\x32\x16.feast.types.BytesListH\x00\x12\x32\n\x0fstring_list_val\x18\x0c \x01(\x0b\x32\x17.feast.types.StringListH\x00\x12\x30\n\x0eint32_list_val\x18\r \x01(\x0b\x32\x16.feast.types.Int32ListH\x00\x12\x30\n\x0eint64_list_val\x18\x0e \x01(\x0b\x32\x16.feast.types.Int64ListH\x00\x12\x32\n\x0f\x64ouble_list_val\x18\x0f \x01(\x0b\x32\x17.feast.types.DoubleListH\x00\x12\x30\n\x0e\x66loat_list_val\x18\x10 \x01(\x0b\x32\x16.feast.types.FloatListH\x00\x12.\n\rbool_list_val\x18\x11 \x01(\x0b\x32\x15.feast.types.BoolListH\x00\x12\x39\n\x17unix_timestamp_list_val\x18\x12 \x01(\x0b\x32\x16.feast.types.Int64ListH\x00\x12%\n\x08null_val\x18\x13 \x01(\x0e\x32\x11.feast.types.NullH\x00\x12#\n\x07map_val\x18\x14 \x01(\x0b\x32\x10.feast.types.MapH\x00\x12,\n\x0cmap_list_val\x18\x15 \x01(\x0b\x32\x14.feast.types.MapListH\x00\x42\x05\n\x03val\"\x18\n\tBytesList\x12\x0b\n\x03val\x18\x01 \x03(\x0c\"\x19\n\nStringList\x12\x0b\n\x03val\x18\x01 \x03(\t\"\x18\n\tInt32List\x12\x0b\n\x03val\x18\x01 \x03(\x05\"\x18\n\tInt64List\x12\x0b\n\x03val\x18\x01 \x03(\x03\"\x19\n\nDoubleList\x12\x0b\n\x03val\x18\x01 \x03(\x01\"\x18\n\tFloatList\x12\x0b\n\x03val\x18\x01 \x03(\x02\"\x17\n\x08\x42oolList\x12\x0b\n\x03val\x18\x01 \x03(\x08\"m\n\x03Map\x12&\n\x03val\x18\x01 \x03(\x0b\x32\x19.feast.types.Map.ValEntry\x1a>\n\x08ValEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.feast.types.Value:\x02\x38\x01\"(\n\x07MapList\x12\x1d\n\x03val\x18\x01 \x03(\x0b\x32\x10.feast.types.Map\"0\n\rRepeatedValue\x12\x1f\n\x03val\x18\x01 \x03(\x0b\x32\x12.feast.types.Value*\x10\n\x04Null\x12\x08\n\x04NULL\x10\x00\x42Q\n\x11\x66\x65\x61st.proto.typesB\nValueProtoZ0github.com/feast-dev/feast/go/protos/feast/typesb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17\x66\x65\x61st/types/Value.proto\x12\x0b\x66\x65\x61st.types\"\xb0\x03\n\tValueType\"\xa2\x03\n\x04\x45num\x12\x0b\n\x07INVALID\x10\x00\x12\t\n\x05\x42YTES\x10\x01\x12\n\n\x06STRING\x10\x02\x12\t\n\x05INT32\x10\x03\x12\t\n\x05INT64\x10\x04\x12\n\n\x06\x44OUBLE\x10\x05\x12\t\n\x05\x46LOAT\x10\x06\x12\x08\n\x04\x42OOL\x10\x07\x12\x12\n\x0eUNIX_TIMESTAMP\x10\x08\x12\x0e\n\nBYTES_LIST\x10\x0b\x12\x0f\n\x0bSTRING_LIST\x10\x0c\x12\x0e\n\nINT32_LIST\x10\r\x12\x0e\n\nINT64_LIST\x10\x0e\x12\x0f\n\x0b\x44OUBLE_LIST\x10\x0f\x12\x0e\n\nFLOAT_LIST\x10\x10\x12\r\n\tBOOL_LIST\x10\x11\x12\x17\n\x13UNIX_TIMESTAMP_LIST\x10\x12\x12\x08\n\x04NULL\x10\x13\x12\x07\n\x03MAP\x10\x14\x12\x0c\n\x08MAP_LIST\x10\x15\x12\r\n\tBYTES_SET\x10\x16\x12\x0e\n\nSTRING_SET\x10\x17\x12\r\n\tINT32_SET\x10\x18\x12\r\n\tINT64_SET\x10\x19\x12\x0e\n\nDOUBLE_SET\x10\x1a\x12\r\n\tFLOAT_SET\x10\x1b\x12\x0c\n\x08\x42OOL_SET\x10\x1c\x12\x16\n\x12UNIX_TIMESTAMP_SET\x10\x1d\"\xe0\x08\n\x05Value\x12\x13\n\tbytes_val\x18\x01 \x01(\x0cH\x00\x12\x14\n\nstring_val\x18\x02 \x01(\tH\x00\x12\x13\n\tint32_val\x18\x03 \x01(\x05H\x00\x12\x13\n\tint64_val\x18\x04 \x01(\x03H\x00\x12\x14\n\ndouble_val\x18\x05 \x01(\x01H\x00\x12\x13\n\tfloat_val\x18\x06 \x01(\x02H\x00\x12\x12\n\x08\x62ool_val\x18\x07 \x01(\x08H\x00\x12\x1c\n\x12unix_timestamp_val\x18\x08 \x01(\x03H\x00\x12\x30\n\x0e\x62ytes_list_val\x18\x0b \x01(\x0b\x32\x16.feast.types.BytesListH\x00\x12\x32\n\x0fstring_list_val\x18\x0c \x01(\x0b\x32\x17.feast.types.StringListH\x00\x12\x30\n\x0eint32_list_val\x18\r \x01(\x0b\x32\x16.feast.types.Int32ListH\x00\x12\x30\n\x0eint64_list_val\x18\x0e \x01(\x0b\x32\x16.feast.types.Int64ListH\x00\x12\x32\n\x0f\x64ouble_list_val\x18\x0f \x01(\x0b\x32\x17.feast.types.DoubleListH\x00\x12\x30\n\x0e\x66loat_list_val\x18\x10 \x01(\x0b\x32\x16.feast.types.FloatListH\x00\x12.\n\rbool_list_val\x18\x11 \x01(\x0b\x32\x15.feast.types.BoolListH\x00\x12\x39\n\x17unix_timestamp_list_val\x18\x12 \x01(\x0b\x32\x16.feast.types.Int64ListH\x00\x12%\n\x08null_val\x18\x13 \x01(\x0e\x32\x11.feast.types.NullH\x00\x12#\n\x07map_val\x18\x14 \x01(\x0b\x32\x10.feast.types.MapH\x00\x12,\n\x0cmap_list_val\x18\x15 \x01(\x0b\x32\x14.feast.types.MapListH\x00\x12.\n\rbytes_set_val\x18\x16 \x01(\x0b\x32\x15.feast.types.BytesSetH\x00\x12\x30\n\x0estring_set_val\x18\x17 \x01(\x0b\x32\x16.feast.types.StringSetH\x00\x12.\n\rint32_set_val\x18\x18 \x01(\x0b\x32\x15.feast.types.Int32SetH\x00\x12.\n\rint64_set_val\x18\x19 \x01(\x0b\x32\x15.feast.types.Int64SetH\x00\x12\x30\n\x0e\x64ouble_set_val\x18\x1a \x01(\x0b\x32\x16.feast.types.DoubleSetH\x00\x12.\n\rfloat_set_val\x18\x1b \x01(\x0b\x32\x15.feast.types.FloatSetH\x00\x12,\n\x0c\x62ool_set_val\x18\x1c \x01(\x0b\x32\x14.feast.types.BoolSetH\x00\x12\x37\n\x16unix_timestamp_set_val\x18\x1d \x01(\x0b\x32\x15.feast.types.Int64SetH\x00\x42\x05\n\x03val\"\x18\n\tBytesList\x12\x0b\n\x03val\x18\x01 \x03(\x0c\"\x19\n\nStringList\x12\x0b\n\x03val\x18\x01 \x03(\t\"\x18\n\tInt32List\x12\x0b\n\x03val\x18\x01 \x03(\x05\"\x18\n\tInt64List\x12\x0b\n\x03val\x18\x01 \x03(\x03\"\x19\n\nDoubleList\x12\x0b\n\x03val\x18\x01 \x03(\x01\"\x18\n\tFloatList\x12\x0b\n\x03val\x18\x01 \x03(\x02\"\x17\n\x08\x42oolList\x12\x0b\n\x03val\x18\x01 \x03(\x08\"\x17\n\x08\x42ytesSet\x12\x0b\n\x03val\x18\x01 \x03(\x0c\"\x18\n\tStringSet\x12\x0b\n\x03val\x18\x01 \x03(\t\"\x17\n\x08Int32Set\x12\x0b\n\x03val\x18\x01 \x03(\x05\"\x17\n\x08Int64Set\x12\x0b\n\x03val\x18\x01 \x03(\x03\"\x18\n\tDoubleSet\x12\x0b\n\x03val\x18\x01 \x03(\x01\"\x17\n\x08\x46loatSet\x12\x0b\n\x03val\x18\x01 \x03(\x02\"\x16\n\x07\x42oolSet\x12\x0b\n\x03val\x18\x01 \x03(\x08\"m\n\x03Map\x12&\n\x03val\x18\x01 \x03(\x0b\x32\x19.feast.types.Map.ValEntry\x1a>\n\x08ValEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.feast.types.Value:\x02\x38\x01\"(\n\x07MapList\x12\x1d\n\x03val\x18\x01 \x03(\x0b\x32\x10.feast.types.Map\"0\n\rRepeatedValue\x12\x1f\n\x03val\x18\x01 \x03(\x0b\x32\x12.feast.types.Value*\x10\n\x04Null\x12\x08\n\x04NULL\x10\x00\x42Q\n\x11\x66\x65\x61st.proto.typesB\nValueProtoZ0github.com/feast-dev/feast/go/protos/feast/typesb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -24,34 +24,48 @@ _globals['DESCRIPTOR']._serialized_options = b'\n\021feast.proto.typesB\nValueProtoZ0github.com/feast-dev/feast/go/protos/feast/types' _globals['_MAP_VALENTRY']._options = None _globals['_MAP_VALENTRY']._serialized_options = b'8\001' - _globals['_NULL']._serialized_start=1459 - _globals['_NULL']._serialized_end=1475 + _globals['_NULL']._serialized_start=2160 + _globals['_NULL']._serialized_end=2176 _globals['_VALUETYPE']._serialized_start=41 - _globals['_VALUETYPE']._serialized_end=343 + _globals['_VALUETYPE']._serialized_end=473 _globals['_VALUETYPE_ENUM']._serialized_start=55 - _globals['_VALUETYPE_ENUM']._serialized_end=343 - _globals['_VALUE']._serialized_start=346 - _globals['_VALUE']._serialized_end=1071 - _globals['_BYTESLIST']._serialized_start=1073 - _globals['_BYTESLIST']._serialized_end=1097 - _globals['_STRINGLIST']._serialized_start=1099 - _globals['_STRINGLIST']._serialized_end=1124 - _globals['_INT32LIST']._serialized_start=1126 - _globals['_INT32LIST']._serialized_end=1150 - _globals['_INT64LIST']._serialized_start=1152 - _globals['_INT64LIST']._serialized_end=1176 - _globals['_DOUBLELIST']._serialized_start=1178 - _globals['_DOUBLELIST']._serialized_end=1203 - _globals['_FLOATLIST']._serialized_start=1205 - _globals['_FLOATLIST']._serialized_end=1229 - _globals['_BOOLLIST']._serialized_start=1231 - _globals['_BOOLLIST']._serialized_end=1254 - _globals['_MAP']._serialized_start=1256 - _globals['_MAP']._serialized_end=1365 - _globals['_MAP_VALENTRY']._serialized_start=1303 - _globals['_MAP_VALENTRY']._serialized_end=1365 - _globals['_MAPLIST']._serialized_start=1367 - _globals['_MAPLIST']._serialized_end=1407 - _globals['_REPEATEDVALUE']._serialized_start=1409 - _globals['_REPEATEDVALUE']._serialized_end=1457 + _globals['_VALUETYPE_ENUM']._serialized_end=473 + _globals['_VALUE']._serialized_start=476 + _globals['_VALUE']._serialized_end=1596 + _globals['_BYTESLIST']._serialized_start=1598 + _globals['_BYTESLIST']._serialized_end=1622 + _globals['_STRINGLIST']._serialized_start=1624 + _globals['_STRINGLIST']._serialized_end=1649 + _globals['_INT32LIST']._serialized_start=1651 + _globals['_INT32LIST']._serialized_end=1675 + _globals['_INT64LIST']._serialized_start=1677 + _globals['_INT64LIST']._serialized_end=1701 + _globals['_DOUBLELIST']._serialized_start=1703 + _globals['_DOUBLELIST']._serialized_end=1728 + _globals['_FLOATLIST']._serialized_start=1730 + _globals['_FLOATLIST']._serialized_end=1754 + _globals['_BOOLLIST']._serialized_start=1756 + _globals['_BOOLLIST']._serialized_end=1779 + _globals['_BYTESSET']._serialized_start=1781 + _globals['_BYTESSET']._serialized_end=1804 + _globals['_STRINGSET']._serialized_start=1806 + _globals['_STRINGSET']._serialized_end=1830 + _globals['_INT32SET']._serialized_start=1832 + _globals['_INT32SET']._serialized_end=1855 + _globals['_INT64SET']._serialized_start=1857 + _globals['_INT64SET']._serialized_end=1880 + _globals['_DOUBLESET']._serialized_start=1882 + _globals['_DOUBLESET']._serialized_end=1906 + _globals['_FLOATSET']._serialized_start=1908 + _globals['_FLOATSET']._serialized_end=1931 + _globals['_BOOLSET']._serialized_start=1933 + _globals['_BOOLSET']._serialized_end=1955 + _globals['_MAP']._serialized_start=1957 + _globals['_MAP']._serialized_end=2066 + _globals['_MAP_VALENTRY']._serialized_start=2004 + _globals['_MAP_VALENTRY']._serialized_end=2066 + _globals['_MAPLIST']._serialized_start=2068 + _globals['_MAPLIST']._serialized_end=2108 + _globals['_REPEATEDVALUE']._serialized_start=2110 + _globals['_REPEATEDVALUE']._serialized_end=2158 # @@protoc_insertion_point(module_scope) diff --git a/sdk/python/feast/protos/feast/types/Value_pb2.pyi b/sdk/python/feast/protos/feast/types/Value_pb2.pyi index a1e364ec7e5..0e10849ebad 100644 --- a/sdk/python/feast/protos/feast/types/Value_pb2.pyi +++ b/sdk/python/feast/protos/feast/types/Value_pb2.pyi @@ -74,6 +74,14 @@ class ValueType(google.protobuf.message.Message): NULL: ValueType._Enum.ValueType # 19 MAP: ValueType._Enum.ValueType # 20 MAP_LIST: ValueType._Enum.ValueType # 21 + BYTES_SET: ValueType._Enum.ValueType # 22 + STRING_SET: ValueType._Enum.ValueType # 23 + INT32_SET: ValueType._Enum.ValueType # 24 + INT64_SET: ValueType._Enum.ValueType # 25 + DOUBLE_SET: ValueType._Enum.ValueType # 26 + FLOAT_SET: ValueType._Enum.ValueType # 27 + BOOL_SET: ValueType._Enum.ValueType # 28 + UNIX_TIMESTAMP_SET: ValueType._Enum.ValueType # 29 class Enum(_Enum, metaclass=_EnumEnumTypeWrapper): ... INVALID: ValueType.Enum.ValueType # 0 @@ -96,6 +104,14 @@ class ValueType(google.protobuf.message.Message): NULL: ValueType.Enum.ValueType # 19 MAP: ValueType.Enum.ValueType # 20 MAP_LIST: ValueType.Enum.ValueType # 21 + BYTES_SET: ValueType.Enum.ValueType # 22 + STRING_SET: ValueType.Enum.ValueType # 23 + INT32_SET: ValueType.Enum.ValueType # 24 + INT64_SET: ValueType.Enum.ValueType # 25 + DOUBLE_SET: ValueType.Enum.ValueType # 26 + FLOAT_SET: ValueType.Enum.ValueType # 27 + BOOL_SET: ValueType.Enum.ValueType # 28 + UNIX_TIMESTAMP_SET: ValueType.Enum.ValueType # 29 def __init__( self, @@ -125,6 +141,14 @@ class Value(google.protobuf.message.Message): NULL_VAL_FIELD_NUMBER: builtins.int MAP_VAL_FIELD_NUMBER: builtins.int MAP_LIST_VAL_FIELD_NUMBER: builtins.int + BYTES_SET_VAL_FIELD_NUMBER: builtins.int + STRING_SET_VAL_FIELD_NUMBER: builtins.int + INT32_SET_VAL_FIELD_NUMBER: builtins.int + INT64_SET_VAL_FIELD_NUMBER: builtins.int + DOUBLE_SET_VAL_FIELD_NUMBER: builtins.int + FLOAT_SET_VAL_FIELD_NUMBER: builtins.int + BOOL_SET_VAL_FIELD_NUMBER: builtins.int + UNIX_TIMESTAMP_SET_VAL_FIELD_NUMBER: builtins.int bytes_val: builtins.bytes string_val: builtins.str int32_val: builtins.int @@ -154,6 +178,22 @@ class Value(google.protobuf.message.Message): def map_val(self) -> global___Map: ... @property def map_list_val(self) -> global___MapList: ... + @property + def bytes_set_val(self) -> global___BytesSet: ... + @property + def string_set_val(self) -> global___StringSet: ... + @property + def int32_set_val(self) -> global___Int32Set: ... + @property + def int64_set_val(self) -> global___Int64Set: ... + @property + def double_set_val(self) -> global___DoubleSet: ... + @property + def float_set_val(self) -> global___FloatSet: ... + @property + def bool_set_val(self) -> global___BoolSet: ... + @property + def unix_timestamp_set_val(self) -> global___Int64Set: ... def __init__( self, *, @@ -176,10 +216,18 @@ class Value(google.protobuf.message.Message): null_val: global___Null.ValueType = ..., map_val: global___Map | None = ..., map_list_val: global___MapList | None = ..., + bytes_set_val: global___BytesSet | None = ..., + string_set_val: global___StringSet | None = ..., + int32_set_val: global___Int32Set | None = ..., + int64_set_val: global___Int64Set | None = ..., + double_set_val: global___DoubleSet | None = ..., + float_set_val: global___FloatSet | None = ..., + bool_set_val: global___BoolSet | None = ..., + unix_timestamp_set_val: global___Int64Set | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["bool_list_val", b"bool_list_val", "bool_val", b"bool_val", "bytes_list_val", b"bytes_list_val", "bytes_val", b"bytes_val", "double_list_val", b"double_list_val", "double_val", b"double_val", "float_list_val", b"float_list_val", "float_val", b"float_val", "int32_list_val", b"int32_list_val", "int32_val", b"int32_val", "int64_list_val", b"int64_list_val", "int64_val", b"int64_val", "map_list_val", b"map_list_val", "map_val", b"map_val", "null_val", b"null_val", "string_list_val", b"string_list_val", "string_val", b"string_val", "unix_timestamp_list_val", b"unix_timestamp_list_val", "unix_timestamp_val", b"unix_timestamp_val", "val", b"val"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["bool_list_val", b"bool_list_val", "bool_val", b"bool_val", "bytes_list_val", b"bytes_list_val", "bytes_val", b"bytes_val", "double_list_val", b"double_list_val", "double_val", b"double_val", "float_list_val", b"float_list_val", "float_val", b"float_val", "int32_list_val", b"int32_list_val", "int32_val", b"int32_val", "int64_list_val", b"int64_list_val", "int64_val", b"int64_val", "map_list_val", b"map_list_val", "map_val", b"map_val", "null_val", b"null_val", "string_list_val", b"string_list_val", "string_val", b"string_val", "unix_timestamp_list_val", b"unix_timestamp_list_val", "unix_timestamp_val", b"unix_timestamp_val", "val", b"val"]) -> None: ... - def WhichOneof(self, oneof_group: typing_extensions.Literal["val", b"val"]) -> typing_extensions.Literal["bytes_val", "string_val", "int32_val", "int64_val", "double_val", "float_val", "bool_val", "unix_timestamp_val", "bytes_list_val", "string_list_val", "int32_list_val", "int64_list_val", "double_list_val", "float_list_val", "bool_list_val", "unix_timestamp_list_val", "null_val", "map_val", "map_list_val"] | None: ... + def HasField(self, field_name: typing_extensions.Literal["bool_list_val", b"bool_list_val", "bool_set_val", b"bool_set_val", "bool_val", b"bool_val", "bytes_list_val", b"bytes_list_val", "bytes_set_val", b"bytes_set_val", "bytes_val", b"bytes_val", "double_list_val", b"double_list_val", "double_set_val", b"double_set_val", "double_val", b"double_val", "float_list_val", b"float_list_val", "float_set_val", b"float_set_val", "float_val", b"float_val", "int32_list_val", b"int32_list_val", "int32_set_val", b"int32_set_val", "int32_val", b"int32_val", "int64_list_val", b"int64_list_val", "int64_set_val", b"int64_set_val", "int64_val", b"int64_val", "map_list_val", b"map_list_val", "map_val", b"map_val", "null_val", b"null_val", "string_list_val", b"string_list_val", "string_set_val", b"string_set_val", "string_val", b"string_val", "unix_timestamp_list_val", b"unix_timestamp_list_val", "unix_timestamp_set_val", b"unix_timestamp_set_val", "unix_timestamp_val", b"unix_timestamp_val", "val", b"val"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["bool_list_val", b"bool_list_val", "bool_set_val", b"bool_set_val", "bool_val", b"bool_val", "bytes_list_val", b"bytes_list_val", "bytes_set_val", b"bytes_set_val", "bytes_val", b"bytes_val", "double_list_val", b"double_list_val", "double_set_val", b"double_set_val", "double_val", b"double_val", "float_list_val", b"float_list_val", "float_set_val", b"float_set_val", "float_val", b"float_val", "int32_list_val", b"int32_list_val", "int32_set_val", b"int32_set_val", "int32_val", b"int32_val", "int64_list_val", b"int64_list_val", "int64_set_val", b"int64_set_val", "int64_val", b"int64_val", "map_list_val", b"map_list_val", "map_val", b"map_val", "null_val", b"null_val", "string_list_val", b"string_list_val", "string_set_val", b"string_set_val", "string_val", b"string_val", "unix_timestamp_list_val", b"unix_timestamp_list_val", "unix_timestamp_set_val", b"unix_timestamp_set_val", "unix_timestamp_val", b"unix_timestamp_val", "val", b"val"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["val", b"val"]) -> typing_extensions.Literal["bytes_val", "string_val", "int32_val", "int64_val", "double_val", "float_val", "bool_val", "unix_timestamp_val", "bytes_list_val", "string_list_val", "int32_list_val", "int64_list_val", "double_list_val", "float_list_val", "bool_list_val", "unix_timestamp_list_val", "null_val", "map_val", "map_list_val", "bytes_set_val", "string_set_val", "int32_set_val", "int64_set_val", "double_set_val", "float_set_val", "bool_set_val", "unix_timestamp_set_val"] | None: ... global___Value = Value @@ -288,6 +336,111 @@ class BoolList(google.protobuf.message.Message): global___BoolList = BoolList +class BytesSet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bytes]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.bytes] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___BytesSet = BytesSet + +class StringSet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.str] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___StringSet = StringSet + +class Int32Set(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___Int32Set = Int32Set + +class Int64Set(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___Int64Set = Int64Set + +class DoubleSet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.float] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___DoubleSet = DoubleSet + +class FloatSet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.float] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___FloatSet = FloatSet + +class BoolSet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + VAL_FIELD_NUMBER: builtins.int + @property + def val(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]: ... + def __init__( + self, + *, + val: collections.abc.Iterable[builtins.bool] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["val", b"val"]) -> None: ... + +global___BoolSet = BoolSet + class Map(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 10917150794..b7b97b0a29a 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -39,17 +39,24 @@ from feast.protos.feast.types.Value_pb2 import ( BoolList, + BoolSet, BytesList, + BytesSet, DoubleList, + DoubleSet, FloatList, + FloatSet, Int32List, + Int32Set, Int64List, + Int64Set, Map, MapList, StringList, + StringSet, ) from feast.protos.feast.types.Value_pb2 import Value as ProtoValue -from feast.value_type import ListType, ValueType +from feast.value_type import ListType, SetType, ValueType if TYPE_CHECKING: import pyarrow @@ -82,7 +89,7 @@ def feast_value_type_to_python_type(field_value_proto: ProtoValue) -> Any: elif val_attr == "map_list_val": return _handle_map_list_value(val) - # If it's a _LIST type extract the list. + # If it's a _LIST or _SET type extract the values. if hasattr(val, "val"): val = list(val.val) @@ -96,12 +103,26 @@ def feast_value_type_to_python_type(field_value_proto: ProtoValue) -> Any: ) for v in val ] + elif val_attr == "unix_timestamp_set_val": + val = set( + [ + ( + datetime.fromtimestamp(v, tz=timezone.utc) + if v != NULL_TIMESTAMP_INT_VALUE + else None + ) + for v in val + ] + ) elif val_attr == "unix_timestamp_val": val = ( datetime.fromtimestamp(val, tz=timezone.utc) if val != NULL_TIMESTAMP_INT_VALUE else None ) + # Convert _SET types to Python sets + elif val_attr.endswith("_set_val") and val_attr != "unix_timestamp_set_val": + val = set(val) return val @@ -140,7 +161,11 @@ def feast_value_type_to_pandas_type(value_type: ValueType) -> Any: ValueType.BOOL: "bool", ValueType.UNIX_TIMESTAMP: "datetime64[ns]", } - if value_type.name == "MAP" or value_type.name.endswith("_LIST"): + if ( + value_type.name == "MAP" + or value_type.name.endswith("_LIST") + or value_type.name.endswith("_SET") + ): return "object" if value_type in value_type_to_pandas_type: return value_type_to_pandas_type[value_type] @@ -259,6 +284,40 @@ def python_type_to_feast_value_type( return ValueType.UNKNOWN return ValueType[common_item_value_type.name + "_LIST"] + # Check if it's a set (Set type) + if isinstance(value, set): + if not recurse: + raise ValueError( + f"Value type for field {name} is {type(value)} but " + f"recursion is not allowed. Set types can only be one level " + f"deep." + ) + + # Infer the type from set elements + common_set_item_type = None + for item in value: + if isinstance(item, ProtoValue): + current_set_item_type: ValueType = _proto_value_to_value_type(item) + else: + # Get the type from the current item, only one level deep + current_set_item_type = python_type_to_feast_value_type( + name=name, value=item, recurse=False + ) + # Validate whether the type stays consistent + if ( + common_set_item_type + and not common_set_item_type == current_set_item_type + ): + raise ValueError( + f"Set value type for field {name} is inconsistent. " + f"{common_set_item_type} different from " + f"{current_set_item_type}." + ) + common_set_item_type = current_set_item_type + if common_set_item_type is None: + return ValueType.UNKNOWN + return ValueType[common_set_item_type.name + "_SET"] + # Check if it's a dictionary (Map type) if isinstance(value, dict): return ValueType.MAP @@ -349,6 +408,31 @@ def _type_err(item, dtype): ValueType.BYTES_LIST: (BytesList, "bytes_list_val", [np.bytes_, bytes]), } +PYTHON_SET_VALUE_TYPE_TO_PROTO_VALUE: Dict[ + ValueType, Tuple[SetType, str, List[Type]] +] = { + ValueType.FLOAT_SET: ( + FloatSet, + "float_set_val", + [np.float32, np.float64, float], + ), + ValueType.DOUBLE_SET: ( + DoubleSet, + "double_set_val", + [np.float64, np.float32, float], + ), + ValueType.INT32_SET: (Int32Set, "int32_set_val", [np.int64, np.int32, int]), + ValueType.INT64_SET: (Int64Set, "int64_set_val", [np.int64, np.int32, int]), + ValueType.UNIX_TIMESTAMP_SET: ( + Int64Set, + "unix_timestamp_set_val", + [np.datetime64, np.int64, np.int32, int, datetime, Timestamp], + ), + ValueType.STRING_SET: (StringSet, "string_set_val", [np.str_, str]), + ValueType.BOOL_SET: (BoolSet, "bool_set_val", [np.bool_, bool]), + ValueType.BYTES_SET: (BytesSet, "bytes_set_val", [np.bytes_, bytes]), +} + PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: Dict[ ValueType, Tuple[str, Any, Optional[Set[Type]]] ] = { @@ -399,6 +483,124 @@ def _python_datetime_to_int_timestamp( return int_timestamps +def _python_set_to_proto_values( + feast_value_type: ValueType, values: List[Any] +) -> List[ProtoValue]: + """ + Converts Python set values to Feast Proto Values. + + Args: + feast_value_type: The target set value type + values: List of set values that will be converted + + Returns: + List of Feast Value Proto + """ + # Feature can be set but None is still valid + if feast_value_type not in PYTHON_SET_VALUE_TYPE_TO_PROTO_VALUE: + return [] + + set_proto_type, set_field_name, set_valid_types = ( + PYTHON_SET_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] + ) + + # Convert set to list for proto (proto doesn't have native set type) + # We store unique values in a repeated field + def convert_set_to_list(value): + if value is None: + return None + # If it's already a set, convert to list + if isinstance(value, set): + return list(value) + # If it's a list/tuple/ndarray, remove duplicates + elif isinstance(value, (list, tuple, np.ndarray)): + return list(set(value)) + else: + return value + + converted_values = [convert_set_to_list(v) for v in values] + sample = next(filter(_non_empty_value, converted_values), None) + + # Bytes to array type conversion + if isinstance(sample, (bytes, bytearray)): + # Bytes of an array containing elements of bytes not supported + if feast_value_type == ValueType.BYTES_SET: + raise _type_err(sample, ValueType.BYTES_SET) + + json_sample = json.loads(sample) + if isinstance(json_sample, list): + json_values = [ + json.loads(value) if value is not None else None + for value in converted_values + ] + if feast_value_type == ValueType.BOOL_SET: + json_values = [ + [bool(item) for item in list_item] + if list_item is not None + else None + for list_item in json_values + ] + return [ + ( + ProtoValue(**{set_field_name: set_proto_type(val=v)}) # type: ignore + if v is not None + else ProtoValue() + ) + for v in json_values + ] + raise _type_err(sample, set_valid_types[0]) + + if sample is not None and not all(type(item) in set_valid_types for item in sample): + for item in sample: + if type(item) not in set_valid_types: + if feast_value_type in [ + ValueType.INT32_SET, + ValueType.INT64_SET, + ]: + if not any(np.isnan(item) for item in sample): + logger.error("Set of Int32 or Int64 type has NULL values.") + raise _type_err(item, set_valid_types[0]) + + if feast_value_type == ValueType.UNIX_TIMESTAMP_SET: + result = [] + for value in converted_values: + if value is not None: + result.append( + ProtoValue( + unix_timestamp_set_val=Int64Set( + val=_python_datetime_to_int_timestamp(value) # type: ignore + ) + ) + ) + else: + result.append(ProtoValue()) + return result + if feast_value_type == ValueType.BOOL_SET: + result = [] + for value in converted_values: + if value is not None: + result.append( + ProtoValue( + **{ + set_field_name: set_proto_type( + val=[bool(e) for e in value] # type: ignore + ) + } + ) + ) + else: + result.append(ProtoValue()) + return result + return [ + ( + ProtoValue(**{set_field_name: set_proto_type(val=value)}) # type: ignore + if value is not None + else ProtoValue() + ) + for value in converted_values + ] + + def _python_value_to_proto_value( feast_value_type: ValueType, values: List[Any] ) -> List[ProtoValue]: @@ -478,31 +680,33 @@ def _python_value_to_proto_value( raise _type_err(item, valid_types[0]) if feast_value_type == ValueType.UNIX_TIMESTAMP_LIST: - return [ - ( + result = [] + for value in values: + if value is not None: # ProtoValue does actually accept `np.int_` but the typing complains. - ProtoValue( - unix_timestamp_list_val=Int64List( - val=_python_datetime_to_int_timestamp(value) # type: ignore + result.append( + ProtoValue( + unix_timestamp_list_val=Int64List( + val=_python_datetime_to_int_timestamp(value) # type: ignore + ) ) ) - if value is not None - else ProtoValue() - ) - for value in values - ] + else: + result.append(ProtoValue()) + return result if feast_value_type == ValueType.BOOL_LIST: # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. - return [ - ( - ProtoValue( - **{field_name: proto_type(val=[bool(e) for e in value])} # type: ignore + result = [] + for value in values: + if value is not None: + result.append( + ProtoValue( + **{field_name: proto_type(val=[bool(e) for e in value])} # type: ignore + ) ) - if value is not None - else ProtoValue() - ) - for value in values - ] + else: + result.append(ProtoValue()) + return result return [ ( ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore @@ -512,6 +716,10 @@ def _python_value_to_proto_value( for value in values ] + # Detect set type and handle separately + if "set" in feast_value_type.name.lower(): + return _python_set_to_proto_values(feast_value_type, values) + # Handle scalar types below else: if sample is None: @@ -647,6 +855,7 @@ def python_values_to_proto_values( "string_val": ValueType.STRING, "bytes_val": ValueType.BYTES, "bool_val": ValueType.BOOL, + "unix_timestamp_val": ValueType.UNIX_TIMESTAMP, "int32_list_val": ValueType.INT32_LIST, "int64_list_val": ValueType.INT64_LIST, "double_list_val": ValueType.DOUBLE_LIST, @@ -654,8 +863,17 @@ def python_values_to_proto_values( "string_list_val": ValueType.STRING_LIST, "bytes_list_val": ValueType.BYTES_LIST, "bool_list_val": ValueType.BOOL_LIST, + "unix_timestamp_list_val": ValueType.UNIX_TIMESTAMP_LIST, "map_val": ValueType.MAP, "map_list_val": ValueType.MAP_LIST, + "int32_set_val": ValueType.INT32_SET, + "int64_set_val": ValueType.INT64_SET, + "double_set_val": ValueType.DOUBLE_SET, + "float_set_val": ValueType.FLOAT_SET, + "string_set_val": ValueType.STRING_SET, + "bytes_set_val": ValueType.BYTES_SET, + "bool_set_val": ValueType.BOOL_SET, + "unix_timestamp_set_val": ValueType.UNIX_TIMESTAMP_SET, } VALUE_TYPE_TO_PROTO_VALUE_MAP: Dict[ValueType, str] = { diff --git a/sdk/python/feast/types.py b/sdk/python/feast/types.py index 2dab0ba2b0a..922b3cce0ac 100644 --- a/sdk/python/feast/types.py +++ b/sdk/python/feast/types.py @@ -178,6 +178,36 @@ def __str__(self): return f"Array({self.base_type})" +class Set(ComplexFeastType): + """ + A Set represents a set of unique values of a given type. + + Attributes: + base_type: The base type of the set. + """ + + base_type: Union[PrimitiveFeastType, ComplexFeastType] + + def __init__(self, base_type: Union[PrimitiveFeastType, ComplexFeastType]): + # Sets do not support MAP as a base type + supported_set_types = [t for t in SUPPORTED_BASE_TYPES if t != Map] + if base_type not in supported_set_types: + raise ValueError( + f"Type {type(base_type)} is currently not supported as a base type for Set." + ) + + self.base_type = base_type + + def to_value_type(self) -> ValueType: + assert isinstance(self.base_type, PrimitiveFeastType) + value_type_name = PRIMITIVE_FEAST_TYPES_TO_VALUE_TYPES[self.base_type.name] + value_type_set_name = value_type_name + "_SET" + return ValueType[value_type_set_name] + + def __str__(self): + return f"Set({self.base_type})" + + FeastType = Union[ComplexFeastType, PrimitiveFeastType] VALUE_TYPES_TO_FEAST_TYPES: Dict["ValueType", FeastType] = { @@ -202,6 +232,14 @@ def __str__(self): ValueType.UNIX_TIMESTAMP_LIST: Array(UnixTimestamp), ValueType.MAP: Map, ValueType.MAP_LIST: Array(Map), + ValueType.BYTES_SET: Set(Bytes), + ValueType.STRING_SET: Set(String), + ValueType.INT32_SET: Set(Int32), + ValueType.INT64_SET: Set(Int64), + ValueType.DOUBLE_SET: Set(Float64), + ValueType.FLOAT_SET: Set(Float32), + ValueType.BOOL_SET: Set(Bool), + ValueType.UNIX_TIMESTAMP_SET: Set(UnixTimestamp), } FEAST_TYPES_TO_PYARROW_TYPES = { diff --git a/sdk/python/feast/value_type.py b/sdk/python/feast/value_type.py index b3a1b35c248..bdd47952dc6 100644 --- a/sdk/python/feast/value_type.py +++ b/sdk/python/feast/value_type.py @@ -16,12 +16,19 @@ from feast.protos.feast.types.Value_pb2 import ( BoolList, + BoolSet, BytesList, + BytesSet, DoubleList, + DoubleSet, FloatList, + FloatSet, Int32List, + Int32Set, Int64List, + Int64Set, StringList, + StringSet, ) @@ -50,8 +57,16 @@ class ValueType(enum.Enum): NULL = 19 MAP = 20 MAP_LIST = 21 - PDF_BYTES = 22 - IMAGE_BYTES = 23 + BYTES_SET = 22 + STRING_SET = 23 + INT32_SET = 24 + INT64_SET = 25 + DOUBLE_SET = 26 + FLOAT_SET = 27 + BOOL_SET = 28 + UNIX_TIMESTAMP_SET = 29 + PDF_BYTES = 30 + IMAGE_BYTES = 31 ListType = Union[ @@ -63,3 +78,13 @@ class ValueType(enum.Enum): Type[Int64List], Type[StringList], ] + +SetType = Union[ + Type[BoolSet], + Type[BytesSet], + Type[DoubleSet], + Type[FloatSet], + Type[Int32Set], + Type[Int64Set], + Type[StringSet], +] diff --git a/sdk/python/tests/unit/online_store/test_online_retrieval.py b/sdk/python/tests/unit/online_store/test_online_retrieval.py index 26d9430d1b6..501586f7828 100644 --- a/sdk/python/tests/unit/online_store/test_online_retrieval.py +++ b/sdk/python/tests/unit/online_store/test_online_retrieval.py @@ -1208,7 +1208,7 @@ def test_milvus_lite_retrieve_online_documents_v2() -> None: ) documents_df = pd.DataFrame( { - "item_id": [str(i) for i in range(n)], + "item_id": [i for i in range(n)], "author_id": [f"author_{i}" for i in range(n)], "vector": [ np.random.random( diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py index 945b9114f96..8508b490d78 100644 --- a/sdk/python/tests/unit/test_type_map.py +++ b/sdk/python/tests/unit/test_type_map.py @@ -379,3 +379,85 @@ def test_roundtrip_conversion_consistency(self): assert len(converted["list_of_maps"]) == len(original_map["list_of_maps"]) assert converted["list_of_maps"][0]["item"] == "first" assert converted["list_of_maps"][1]["item"] == "second" + + +class TestSetTypes: + """Test cases for SET value types.""" + + def test_simple_set_conversion(self): + """Test basic SET type conversion from Python set to proto and back.""" + test_set = {1, 2, 3, 4, 5} + + protos = python_values_to_proto_values([test_set], ValueType.INT32_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert converted == test_set + + def test_string_set_conversion(self): + """Test STRING_SET type conversion.""" + test_set = {"apple", "banana", "cherry"} + + protos = python_values_to_proto_values([test_set], ValueType.STRING_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert converted == test_set + + def test_float_set_conversion(self): + """Test FLOAT_SET type conversion.""" + test_set = {1.5, 2.5, 3.5} + + protos = python_values_to_proto_values([test_set], ValueType.FLOAT_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert converted == test_set + + def test_bool_set_conversion(self): + """Test BOOL_SET type conversion.""" + test_set = {True, False} + + protos = python_values_to_proto_values([test_set], ValueType.BOOL_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert converted == test_set + + def test_set_from_list_with_duplicates(self): + """Test that duplicate values in lists are removed when converted to sets.""" + test_list = [1, 2, 2, 3, 3, 3, 4, 5, 5] + + protos = python_values_to_proto_values([test_list], ValueType.INT32_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert converted == {1, 2, 3, 4, 5} + + def test_empty_set(self): + """Test empty SET conversion.""" + test_set = set() + + protos = python_values_to_proto_values([test_set], ValueType.STRING_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert isinstance(converted, set) + assert len(converted) == 0 + + def test_null_set(self): + """Test None SET conversion.""" + protos = python_values_to_proto_values([None], ValueType.INT32_SET) + converted = feast_value_type_to_python_type(protos[0]) + + assert converted is None + + def test_multiple_set_values(self): + """Test conversion of multiple set values.""" + test_sets = [{1, 2, 3}, {4, 5}, {6}] + + protos = python_values_to_proto_values(test_sets, ValueType.INT32_SET) + + assert len(protos) == 3 + assert feast_value_type_to_python_type(protos[0]) == {1, 2, 3} + assert feast_value_type_to_python_type(protos[1]) == {4, 5} + assert feast_value_type_to_python_type(protos[2]) == {6} diff --git a/sdk/python/tests/unit/test_types.py b/sdk/python/tests/unit/test_types.py index af490b4f3a9..438735d213a 100644 --- a/sdk/python/tests/unit/test_types.py +++ b/sdk/python/tests/unit/test_types.py @@ -1,6 +1,6 @@ import pytest -from feast.types import Array, Float32, String, from_value_type +from feast.types import Array, Float32, Set, String, from_value_type from feast.value_type import ValueType @@ -27,6 +27,22 @@ def test_array_feast_type(): _ = Array(Array(String)) +def test_set_feast_type(): + set_string = Set(String) + assert set_string.to_value_type() == ValueType.STRING_SET + assert from_value_type(set_string.to_value_type()) == set_string + + set_float_32 = Set(Float32) + assert set_float_32.to_value_type() == ValueType.FLOAT_SET + assert from_value_type(set_float_32.to_value_type()) == set_float_32 + + with pytest.raises(ValueError): + _ = Set(Set) + + with pytest.raises(ValueError): + _ = Set(Set(String)) + + def test_all_value_types(): for value in ValueType: # We do not support the NULL type.