Skip to content

Commit 3d2b431

Browse files
authored
chore: Collection of small improvements for feature validation (#2742)
* Feature logging configurable via feature_store.yaml Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com> * set nan when status is not found Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com> * improvements Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com> * fix tests Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>
1 parent 92c1f34 commit 3d2b431

File tree

4 files changed

+26
-3
lines changed

4 files changed

+26
-3
lines changed

go/types/typeconversion.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ func ArrowValuesToProtoValues(arr arrow.Array) ([]*types.Value, error) {
285285
&types.Value{Val: &types.Value_UnixTimestampVal{
286286
UnixTimestampVal: int64(arr.(*array.Timestamp).Value(idx))}})
287287
}
288+
case arrow.Null:
289+
for idx := 0; idx < arr.Len(); idx++ {
290+
values = append(values, &types.Value{})
291+
}
288292
default:
289293
return nil, fmt.Errorf("unsupported arrow to proto conversion for type %s", arr.DataType())
290294
}

sdk/python/feast/dqm/profilers/ge_profiler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
3737
# This could cause error on comparison => so better to convert to double prematurely
3838
dataset_copy[column] = dataset[column].astype(np.float64)
3939

40+
status_column = f"{column}__status"
41+
if status_column in dataset.columns:
42+
dataset_copy[column] = dataset_copy[column].mask(
43+
dataset[status_column] == FieldStatus.NOT_FOUND, np.nan
44+
)
45+
4046
return dataset_copy
4147

4248

@@ -169,6 +175,8 @@ def errors(self) -> List["ValidationError"]:
169175
check_config=res.expectation_config.kwargs,
170176
missing_count=res["result"].get("missing_count"),
171177
missing_percent=res["result"].get("missing_percent"),
178+
unexpected_count=res["result"].get("unexpected_count"),
179+
unexpected_percent=res["result"].get("unexpected_percent"),
172180
)
173181
for res in self._validation_result["results"]
174182
if not res["success"]

sdk/python/feast/dqm/profilers/profiler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ class ValidationError:
7070
missing_count: Optional[int]
7171
missing_percent: Optional[float]
7272
observed_value: Optional[float]
73+
unexpected_count: Optional[int]
74+
unexpected_percent: Optional[float]
7375

7476
def __init__(
7577
self,
@@ -79,13 +81,17 @@ def __init__(
7981
missing_count: Optional[int] = None,
8082
missing_percent: Optional[float] = None,
8183
observed_value: Optional[float] = None,
84+
unexpected_count: Optional[int] = None,
85+
unexpected_percent: Optional[float] = None,
8286
):
8387
self.check_name = check_name
8488
self.column_name = column_name
8589
self.check_config = check_config
8690
self.missing_count = missing_count
8791
self.missing_percent = missing_percent
8892
self.observed_value = observed_value
93+
self.unexpected_count = unexpected_count
94+
self.unexpected_percent = unexpected_percent
8995

9096
def __repr__(self):
9197
return f"<ValidationError {self.check_name}:{self.column_name}>"
@@ -98,4 +104,6 @@ def to_dict(self):
98104
missing_count=self.missing_count,
99105
missing_percent=self.missing_percent,
100106
observed_value=self.observed_value,
107+
unexpected_count=self.unexpected_count,
108+
unexpected_percent=self.unexpected_percent,
101109
)

sdk/python/tests/integration/e2e/test_validation.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,8 @@ def test_e2e_validation_via_cli(environment, universal_data_sources):
317317
feature_service.name,
318318
"--reference",
319319
reference.name,
320-
(datetime.datetime.utcnow() - datetime.timedelta(days=7)).isoformat(),
321-
datetime.datetime.utcnow().isoformat(),
320+
(datetime.datetime.now() - datetime.timedelta(days=7)).isoformat(),
321+
datetime.datetime.now().isoformat(),
322322
]
323323
p = runner.run(validate_args, cwd=local_repo.repo_path)
324324

@@ -335,7 +335,10 @@ def test_e2e_validation_via_cli(environment, universal_data_sources):
335335
"current_balance": [0],
336336
"avg_passenger_count": [0],
337337
"lifetime_trip_count": [0],
338-
"event_timestamp": [make_tzaware(datetime.datetime.utcnow())],
338+
"event_timestamp": [
339+
make_tzaware(datetime.datetime.utcnow())
340+
- datetime.timedelta(hours=1)
341+
],
339342
}
340343
)
341344
invalid_logs = prepare_logs(invalid_data, feature_service, store)

0 commit comments

Comments
 (0)