chore: Collection of small improvements for feature validation (#2742)

pyalex · web-flow · commit 3d2b43111ae4 · 2022-05-27T18:24:37.000-07:00
* Feature logging configurable via feature_store.yaml

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* set nan when status is not found

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* improvements

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* fix tests

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>
diff --git a/go/types/typeconversion.go b/go/types/typeconversion.go
@@ -285,6 +285,10 @@ func ArrowValuesToProtoValues(arr arrow.Array) ([]*types.Value, error) {
 				&types.Value{Val: &types.Value_UnixTimestampVal{
 					UnixTimestampVal: int64(arr.(*array.Timestamp).Value(idx))}})
 		}
+	case arrow.Null:
+		for idx := 0; idx < arr.Len(); idx++ {
+			values = append(values, &types.Value{})
+		}
 	default:
 		return nil, fmt.Errorf("unsupported arrow to proto conversion for type %s", arr.DataType())
 	}
diff --git a/sdk/python/feast/dqm/profilers/ge_profiler.py b/sdk/python/feast/dqm/profilers/ge_profiler.py
@@ -37,6 +37,12 @@ def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
             # This could cause error on comparison => so better to convert to double prematurely
             dataset_copy[column] = dataset[column].astype(np.float64)
 
+        status_column = f"{column}__status"
+        if status_column in dataset.columns:
+            dataset_copy[column] = dataset_copy[column].mask(
+                dataset[status_column] == FieldStatus.NOT_FOUND, np.nan
+            )
+
     return dataset_copy
 
 
@@ -169,6 +175,8 @@ def errors(self) -> List["ValidationError"]:
                 check_config=res.expectation_config.kwargs,
                 missing_count=res["result"].get("missing_count"),
                 missing_percent=res["result"].get("missing_percent"),
+                unexpected_count=res["result"].get("unexpected_count"),
+                unexpected_percent=res["result"].get("unexpected_percent"),
             )
             for res in self._validation_result["results"]
             if not res["success"]
diff --git a/sdk/python/feast/dqm/profilers/profiler.py b/sdk/python/feast/dqm/profilers/profiler.py
@@ -70,6 +70,8 @@ class ValidationError:
     missing_count: Optional[int]
     missing_percent: Optional[float]
     observed_value: Optional[float]
+    unexpected_count: Optional[int]
+    unexpected_percent: Optional[float]
 
     def __init__(
         self,
@@ -79,13 +81,17 @@ def __init__(
         missing_count: Optional[int] = None,
         missing_percent: Optional[float] = None,
         observed_value: Optional[float] = None,
+        unexpected_count: Optional[int] = None,
+        unexpected_percent: Optional[float] = None,
     ):
         self.check_name = check_name
         self.column_name = column_name
         self.check_config = check_config
         self.missing_count = missing_count
         self.missing_percent = missing_percent
         self.observed_value = observed_value
+        self.unexpected_count = unexpected_count
+        self.unexpected_percent = unexpected_percent
 
     def __repr__(self):
         return f"<ValidationError {self.check_name}:{self.column_name}>"
@@ -98,4 +104,6 @@ def to_dict(self):
             missing_count=self.missing_count,
             missing_percent=self.missing_percent,
             observed_value=self.observed_value,
+            unexpected_count=self.unexpected_count,
+            unexpected_percent=self.unexpected_percent,
         )
diff --git a/sdk/python/tests/integration/e2e/test_validation.py b/sdk/python/tests/integration/e2e/test_validation.py
@@ -317,8 +317,8 @@ def test_e2e_validation_via_cli(environment, universal_data_sources):
             feature_service.name,
             "--reference",
             reference.name,
-            (datetime.datetime.utcnow() - datetime.timedelta(days=7)).isoformat(),
-            datetime.datetime.utcnow().isoformat(),
+            (datetime.datetime.now() - datetime.timedelta(days=7)).isoformat(),
+            datetime.datetime.now().isoformat(),
         ]
         p = runner.run(validate_args, cwd=local_repo.repo_path)
 
@@ -335,7 +335,10 @@ def test_e2e_validation_via_cli(environment, universal_data_sources):
                 "current_balance": [0],
                 "avg_passenger_count": [0],
                 "lifetime_trip_count": [0],
-                "event_timestamp": [make_tzaware(datetime.datetime.utcnow())],
+                "event_timestamp": [
+                    make_tzaware(datetime.datetime.utcnow())
+                    - datetime.timedelta(hours=1)
+                ],
             }
         )
         invalid_logs = prepare_logs(invalid_data, feature_service, store)

Original file line number	Diff line number	Diff line change
`@@ -285,6 +285,10 @@ func ArrowValuesToProtoValues(arr arrow.Array) ([]*types.Value, error) {`
`285`	`285`	`&types.Value{Val: &types.Value_UnixTimestampVal{`
`286`	`286`	`UnixTimestampVal: int64(arr.(*array.Timestamp).Value(idx))}})`
`287`	`287`	`}`
	`288`	`+ case arrow.Null:`
	`289`	`+ for idx := 0; idx < arr.Len(); idx++ {`
	`290`	`+ values = append(values, &types.Value{})`
	`291`	`+ }`
`288`	`292`	`default:`
`289`	`293`	`return nil, fmt.Errorf("unsupported arrow to proto conversion for type %s", arr.DataType())`
`290`	`294`	`}`