Adding initial type support related tests for BQ (feast-dev#1768)

adchia · web-flow · commit 3c392610df93 · 2021-09-01T11:46:18.000-04:00
* Adding initial type support related tests for BQ

Signed-off-by: Danny Chiao <danny@tecton.ai>
diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py
@@ -29,9 +29,11 @@ def update_entities_with_inferred_types_from_feature_views(
         col_names_and_types = view.batch_source.get_table_column_names_and_types(config)
         for entity_name in view.entities:
             if entity_name in incomplete_entities:
+                entity = incomplete_entities[entity_name]
+
                 # get entity information from information extracted from the view batch source
                 extracted_entity_name_type_pairs = list(
-                    filter(lambda tup: tup[0] == entity_name, col_names_and_types)
+                    filter(lambda tup: tup[0] == entity.join_key, col_names_and_types,)
                 )
                 if len(extracted_entity_name_type_pairs) == 0:
                     # Doesn't mention inference error because would also be an error without inferencing
@@ -40,7 +42,6 @@ def update_entities_with_inferred_types_from_feature_views(
                         its entity's name."""
                     )
 
-                entity = incomplete_entities[entity_name]
                 inferred_value_type = view.batch_source.source_datatype_to_feast_value_type()(
                     extracted_entity_name_type_pairs[0][1]
                 )
diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py
@@ -1,15 +1,22 @@
 from datetime import datetime, timedelta
+from typing import List
 
 import pandas as pd
 from pytz import timezone, utc
 
+from feast.value_type import ValueType
 
-def create_dataset() -> pd.DataFrame:
-    now = datetime.utcnow()
+
+def create_dataset(
+    entity_type: ValueType = ValueType.INT32,
+    feature_dtype: str = None,
+    feature_is_list: bool = False,
+) -> pd.DataFrame:
+    now = datetime.now().replace(microsecond=0, second=0, minute=0)
     ts = pd.Timestamp(now).round("ms")
     data = {
-        "id": [1, 2, 1, 3, 3],
-        "value": [0.1, None, 0.3, 4, 5],
+        "driver_id": get_entities_for_value_type(entity_type),
+        "value": get_feature_values_for_dtype(feature_dtype, feature_is_list),
         "ts_1": [
             ts - timedelta(hours=4),
             ts,
@@ -25,3 +32,33 @@ def create_dataset() -> pd.DataFrame:
         "created_ts": [ts, ts, ts, ts, ts],
     }
     return pd.DataFrame.from_dict(data)
+
+
+def get_entities_for_value_type(value_type: ValueType) -> List:
+    value_type_map = {
+        ValueType.INT32: [1, 2, 1, 3, 3],
+        ValueType.INT64: [1, 2, 1, 3, 3],
+        ValueType.FLOAT: [1.0, 2.0, 1.0, 3.0, 3.0],
+        ValueType.STRING: ["1", "2", "1", "3", "3"],
+    }
+    return value_type_map[value_type]
+
+
+def get_feature_values_for_dtype(dtype: str, is_list: bool) -> List:
+    if dtype is None:
+        return [0.1, None, 0.3, 4, 5]
+    # TODO(adchia): for int columns, consider having a better error when dealing with None values (pandas int dfs can't
+    #  have na)
+    dtype_map = {
+        "int32": [1, 2, 3, 4, 5],
+        "int64": [1, 2, 3, 4, 5],
+        "float": [1.0, None, 3.0, 4.0, 5.0],
+        "string": ["1", None, "3", "4", "5"],
+        "bool": [True, None, False, True, False],
+    }
+    non_list_val = dtype_map[dtype]
+    # Duplicate the value once if this is a list
+    if is_list:
+        return [[n, n] if n is not None else None for n in non_list_val]
+    else:
+        return non_list_val
diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -78,7 +78,7 @@ def check_offline_and_online_features(
 def run_offline_online_store_consistency_test(
     fs: FeatureStore, fv: FeatureView
 ) -> None:
-    now = datetime.utcnow()
+    now = datetime.now()
 
     full_feature_names = True
     check_offline_store: bool = True
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -10,6 +10,7 @@
 
 from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer
 from feast.data_source import DataSource
+from feast.value_type import ValueType
 from tests.data.data_creator import create_dataset
 from tests.integration.feature_repos.universal.data_source_creator import (
     DataSourceCreator,
@@ -70,7 +71,6 @@ def ds_creator_path(cls: str):
     ),
 ]
 
-
 OFFLINE_STORES: List[str] = []
 ONLINE_STORES: List[str] = []
 PROVIDERS: List[str] = []
@@ -83,6 +83,9 @@ class Environment:
     feature_store: FeatureStore
     data_source: DataSource
     data_source_creator: DataSourceCreator
+    entity_type: ValueType
+    feature_dtype: str
+    feature_is_list: bool
 
     end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
     start_date = end_date - timedelta(days=7)
@@ -199,6 +202,9 @@ def construct_test_environment(
     test_repo_config: TestRepoConfig,
     create_and_apply: bool = False,
     materialize: bool = False,
+    entity_type: ValueType = ValueType.INT32,
+    feature_dtype: str = None,
+    feature_is_list: bool = False,
 ) -> Environment:
     """
     This method should take in the parameters from the test repo config and created a feature repo, apply it,
@@ -208,9 +214,14 @@ def construct_test_environment(
     The user is *not* expected to perform any clean up actions.
 
     :param test_repo_config: configuration
+    :param create_and_apply: whether to create and apply the repo config
+    :param materialize: whether to materialize features to online store
+    :param entity_type: the data type for the entity column (i.e. id)
+    :param feature_dtype: the data type for the feature column (i.e. value)
+    :param feature_is_list: whether the feature column (i.e. value) should be a list feature
     :return: A feature store built using the supplied configuration.
     """
-    df = create_dataset()
+    df = create_dataset(entity_type, feature_dtype, feature_is_list)
 
     project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"
 
@@ -221,9 +232,7 @@ def construct_test_environment(
     offline_creator: DataSourceCreator = importer.get_class_from_type(
         module_name, config_class_name, "DataSourceCreator"
     )(project)
-    ds = offline_creator.create_data_source(
-        project, df, field_mapping={"ts_1": "ts", "id": "driver_id"}
-    )
+    ds = offline_creator.create_data_source(project, df, field_mapping={"ts_1": "ts"})
     offline_store = offline_creator.create_offline_store_config()
     online_store = test_repo_config.online_store
 
@@ -243,6 +252,9 @@ def construct_test_environment(
             feature_store=fs,
             data_source=ds,
             data_source_creator=offline_creator,
+            entity_type=entity_type,
+            feature_dtype=feature_dtype,
+            feature_is_list=feature_is_list,
         )
 
         fvs = []
@@ -341,3 +353,80 @@ def inner_test(config):
             online_test(environment)
 
     return inner_test
+
+
+def parametrize_types_no_materialize_test(types_test):
+    """
+    This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and
+    not materialize said features
+    """
+    return _parametrize_types_test_internal(types_test, create_apply_materialize=False)
+
+
+def parametrize_types_materialize_test(types_test):
+    """
+    This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and
+    materialize said features
+    """
+    return _parametrize_types_test_internal(types_test, create_apply_materialize=True)
+
+
+def parametrize_types_no_materialize_test_no_list(types_test):
+    """
+    This decorator should be used by tests that want to parametrize by different kinds of entity + feature types, but
+    not materializing and not allowing for feature list types
+    """
+    return _parametrize_types_test_internal(
+        types_test, create_apply_materialize=False, vary_feature_is_list=False
+    )
+
+
+def _parametrize_types_test_internal(
+    types_test, create_apply_materialize: bool, vary_feature_is_list: bool = True
+):
+    def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str):
+        return f"entity_type:{str(entity_type)}-feature_dtype:{feature_dtype}"
+
+    # TODO(adchia): consider adding timestamp / bytes for feature_dtypes
+    # TODO(adchia): test materializing float entity types and ensure we throw an error before querying BQ
+    entity_type_feature_dtypes = [
+        (ValueType.INT32, "int32"),
+        (ValueType.INT64, "int64"),
+        (ValueType.STRING, "float"),
+        (ValueType.STRING, "bool"),
+    ]
+
+    # TODO(adchia): fix conversion to allow for lists in materialization
+    feature_is_list = [True, False] if vary_feature_is_list else [False]
+
+    @pytest.mark.integration
+    @pytest.mark.parametrize(
+        "entity_type,feature_dtype",
+        entity_type_feature_dtypes,
+        ids=[
+            entity_feature_types_ids(entity_type, feature_dtype)
+            for entity_type, feature_dtype in entity_type_feature_dtypes
+        ],
+    )
+    @pytest.mark.parametrize(
+        "feature_is_list", feature_is_list, ids=lambda v: f"feature_is_list:{str(v)}"
+    )
+    def inner_test(entity_type: ValueType, feature_dtype: str, feature_is_list: bool):
+        # TODO: parametrize config
+        with construct_test_environment(
+            TestRepoConfig(
+                provider="gcp",
+                offline_store_creator=ds_creator_path(
+                    "bigquery.BigQueryDataSourceCreator"
+                ),
+                online_store="datastore",
+            ),
+            create_and_apply=create_apply_materialize,
+            materialize=create_apply_materialize,
+            entity_type=entity_type,
+            feature_dtype=feature_dtype,
+            feature_is_list=feature_is_list,
+        ) as environment:
+            types_test(environment)
+
+    return inner_test
diff --git a/sdk/python/tests/integration/feature_repos/universal/entities.py b/sdk/python/tests/integration/feature_repos/universal/entities.py
@@ -1,10 +1,10 @@
 from feast import Entity, ValueType
 
 
-def driver():
+def driver(value_type: ValueType = ValueType.INT64):
     return Entity(
         name="driver",  # The name is derived from this argument, not object name.
-        value_type=ValueType.INT64,
+        value_type=value_type,
         description="driver id",
         join_key="driver_id",
     )
diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py
@@ -5,12 +5,14 @@
 
 
 def driver_feature_view(
-    data_source: DataSource, name="test_correctness"
+    data_source: DataSource,
+    name="test_correctness",
+    value_type: ValueType = ValueType.FLOAT,
 ) -> FeatureView:
     return FeatureView(
         name=name,
         entities=["driver"],
-        features=[Feature("value", ValueType.FLOAT)],
+        features=[Feature("value", value_type)],
         ttl=timedelta(days=5),
         input=data_source,
     )
diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py
@@ -59,7 +59,7 @@ def prep_bq_fs_and_fv(
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         date_partition_column="",
-        field_mapping={"ts_1": "ts", "id": "driver_id"},
+        field_mapping={"ts_1": "ts"},
     )
 
     fv = driver_feature_view(bigquery_source)
@@ -122,7 +122,7 @@ def prep_redshift_fs_and_fv(
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         date_partition_column="",
-        field_mapping={"ts_1": "ts", "id": "driver_id"},
+        field_mapping={"ts_1": "ts"},
     )
 
     fv = driver_feature_view(redshift_source)
@@ -171,7 +171,7 @@ def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
             event_timestamp_column="ts",
             created_timestamp_column="created_ts",
             date_partition_column="",
-            field_mapping={"ts_1": "ts", "id": "driver_id"},
+            field_mapping={"ts_1": "ts"},
         )
         fv = driver_feature_view(file_source)
         e = Entity(
@@ -212,7 +212,7 @@ def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
             event_timestamp_column="ts",
             created_timestamp_column="created_ts",
             date_partition_column="",
-            field_mapping={"ts_1": "ts", "id": "driver_id"},
+            field_mapping={"ts_1": "ts"},
         )
         fv = driver_feature_view(file_source)
         e = Entity(
@@ -254,7 +254,7 @@ def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
             event_timestamp_column="ts",
             created_timestamp_column="created_ts",
             date_partition_column="",
-            field_mapping={"ts_1": "ts", "id": "driver_id"},
+            field_mapping={"ts_1": "ts"},
         )
         fv = driver_feature_view(file_source)
         e = Entity(
@@ -332,7 +332,7 @@ def check_offline_and_online_features(
 def run_offline_online_store_consistency_test(
     fs: FeatureStore, fv: FeatureView, full_feature_names: bool,
 ) -> None:
-    now = datetime.utcnow()
+    now = datetime.now()
     # Run materialize()
     # use both tz-naive & tz-aware timestamps to test that they're both correctly handled
     start_date = (now - timedelta(hours=5)).replace(tzinfo=utc)
diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py

Original file line number	Diff line number	Diff line change
`@@ -29,9 +29,11 @@ def update_entities_with_inferred_types_from_feature_views(`
`29`	`29`	`col_names_and_types = view.batch_source.get_table_column_names_and_types(config)`
`30`	`30`	`for entity_name in view.entities:`
`31`	`31`	`if entity_name in incomplete_entities:`
	`32`	`+ entity = incomplete_entities[entity_name]`
	`33`	`+`
`32`	`34`	`# get entity information from information extracted from the view batch source`
`33`	`35`	`extracted_entity_name_type_pairs = list(`
`34`		`- filter(lambda tup: tup[0] == entity_name, col_names_and_types)`
	`36`	`+ filter(lambda tup: tup[0] == entity.join_key, col_names_and_types,)`
`35`	`37`	`)`
`36`	`38`	`if len(extracted_entity_name_type_pairs) == 0:`
`37`	`39`	`# Doesn't mention inference error because would also be an error without inferencing`
`@@ -40,7 +42,6 @@ def update_entities_with_inferred_types_from_feature_views(`
`40`	`42`	`its entity's name."""`
`41`	`43`	`)`
`42`	`44`
`43`		`- entity = incomplete_entities[entity_name]`
`44`	`45`	`inferred_value_type = view.batch_source.source_datatype_to_feast_value_type()(`
`45`	`46`	`extracted_entity_name_type_pairs[0][1]`
`46`	`47`	`)`