Skip to content

Commit bf23111

Browse files
authored
fix: Timestamp update (feast-dev#2486)
* Timestamps Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Update md files Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Update more Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Update batch source creators Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix data source Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Temp fix Signed-off-by: Kevin Zhang <kzhang@tecton.ai> * Fixed Signed-off-by: Kevin Zhang <kzhang@tecton.ai>
1 parent 0c9e5b7 commit bf23111

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+216
-180
lines changed

docs/how-to-guides/adding-a-new-offline-store.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ There are two methods that deal with reading data from the offline stores`get_hi
5454
data_source: DataSource,
5555
join_key_columns: List[str],
5656
feature_name_columns: List[str],
57-
event_timestamp_column: str,
57+
timestamp_field: str,
5858
created_timestamp_column: Optional[str],
5959
start_date: datetime,
6060
end_date: datetime) -> RetrievalJob:
@@ -63,7 +63,7 @@ There are two methods that deal with reading data from the offline stores`get_hi
6363
data_source,
6464
join_key_columns,
6565
feature_name_columns,
66-
event_timestamp_column,
66+
timestamp_field=timestamp_field,
6767
created_timestamp_column,
6868
start_date,
6969
end_date)
@@ -165,14 +165,14 @@ class CustomFileDataSource(FileSource):
165165
"""Custom data source class for local files"""
166166
def __init__(
167167
self,
168-
event_timestamp_column: Optional[str] = "",
168+
timestamp_field: Optional[str] = "",
169169
path: Optional[str] = None,
170170
field_mapping: Optional[Dict[str, str]] = None,
171171
created_timestamp_column: Optional[str] = "",
172172
date_partition_column: Optional[str] = "",
173173
):
174174
super(CustomFileDataSource, self).__init__(
175-
event_timestamp_column,
175+
timestamp_field=timestamp_field,
176176
created_timestamp_column,
177177
field_mapping,
178178
date_partition_column,
@@ -189,7 +189,7 @@ class CustomFileDataSource(FileSource):
189189
return CustomFileDataSource(
190190
field_mapping=dict(data_source.field_mapping),
191191
path=path,
192-
event_timestamp_column=data_source.event_timestamp_column,
192+
timestamp_field=data_source.timestamp_field,
193193
created_timestamp_column=data_source.created_timestamp_column,
194194
date_partition_column=data_source.date_partition_column,
195195
)
@@ -203,7 +203,7 @@ class CustomFileDataSource(FileSource):
203203
),
204204
)
205205
206-
data_source_proto.event_timestamp_column = self.event_timestamp_column
206+
data_source_proto.timestamp_field = self.timestamp_field
207207
data_source_proto.created_timestamp_column = self.created_timestamp_column
208208
data_source_proto.date_partition_column = self.date_partition_column
209209

docs/reference/data-sources/spark.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import
4545
my_spark_source = SparkSource(
4646
path=f"{CURRENT_DIR}/data/driver_hourly_stats",
4747
file_format="parquet",
48-
event_timestamp_column="event_timestamp",
48+
timestamp_field="event_timestamp",
4949
created_timestamp_column="created",
5050
)
5151
```

docs/reference/feature-repository/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Typically, users store their feature repositories in a Git repository, especiall
2626
The structure of a feature repository is as follows:
2727

2828
* The root of the repository should contain a `feature_store.yaml` file and may contain a `.feastignore` file.
29-
* The repository should contain Python files that contain feature definitions.
29+
* The repository should contain Python files that contain feature definitions.
3030
* The repository can contain other files as well, including documentation and potentially data files.
3131

3232
An example structure of a feature repository is shown below:
@@ -98,7 +98,7 @@ from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType
9898
9999
driver_locations_source = BigQuerySource(
100100
table_ref="rh_prod.ride_hailing_co.drivers",
101-
event_timestamp_column="event_timestamp",
101+
timestamp_field="event_timestamp",
102102
created_timestamp_column="created_timestamp",
103103
)
104104

docs/reference/feature-repository/registration-inferencing.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
## Overview
44

5-
* FeatureView - When the `features` parameter is left out of the feature view definition, upon a `feast apply` call, Feast will automatically consider every column in the data source as a feature to be registered other than the specific timestamp columns associated with the underlying data source definition (e.g. event_timestamp_column) and the columns associated with the feature view's entities.
6-
* DataSource - When the `event_timestamp_column` parameter is left out of the data source definition, upon a 'feast apply' call, Feast will automatically find the sole timestamp column in the table underlying the data source and use that as the `event_timestamp_column`. If there are no columns of timestamp type or multiple columns of timestamp type, `feast apply` will throw an exception.
5+
* FeatureView - When the `features` parameter is left out of the feature view definition, upon a `feast apply` call, Feast will automatically consider every column in the data source as a feature to be registered other than the specific timestamp columns associated with the underlying data source definition (e.g. timestamp_field) and the columns associated with the feature view's entities.
6+
* DataSource - When the `timestamp_field` parameter is left out of the data source definition, upon a 'feast apply' call, Feast will automatically find the sole timestamp column in the table underlying the data source and use that as the `timestamp_field`. If there are no columns of timestamp type or multiple columns of timestamp type, `feast apply` will throw an exception.
77
* Entity - When the `value_type` parameter is left out of the entity definition, upon a `feast apply` call, Feast will automatically find the column corresponding with the entity's `join_key` and take that column's data type to be the `value_type`. If the column doesn't exist, `feast apply` will throw an exception.

docs/tutorials/validating-historical-features.md

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# Validating historical features with Great Expectations
22

3-
In this tutorial, we will use the public dataset of Chicago taxi trips to present data validation capabilities of Feast.
4-
- The original dataset is stored in BigQuery and consists of raw data for each taxi trip (one row per trip) since 2013.
3+
In this tutorial, we will use the public dataset of Chicago taxi trips to present data validation capabilities of Feast.
4+
- The original dataset is stored in BigQuery and consists of raw data for each taxi trip (one row per trip) since 2013.
55
- We will generate several training datasets (aka historical features in Feast) for different periods and evaluate expectations made on one dataset against another.
66

77
Types of features we're ingesting and generating:
8-
- Features that aggregate raw data with daily intervals (eg, trips per day, average fare or speed for a specific day, etc.).
9-
- Features using SQL while pulling data from BigQuery (like total trips time or total miles travelled).
8+
- Features that aggregate raw data with daily intervals (eg, trips per day, average fare or speed for a specific day, etc.).
9+
- Features using SQL while pulling data from BigQuery (like total trips time or total miles travelled).
1010
- Features calculated on the fly when requested using Feast's on-demand transformations
1111

1212
Our plan:
@@ -31,7 +31,7 @@ Install Feast Python SDK and great expectations:
3131
```
3232

3333

34-
### 1. Dataset preparation (Optional)
34+
### 1. Dataset preparation (Optional)
3535

3636
**You can skip this step if you don't have GCP account. Please use parquet files that are coming with this tutorial instead**
3737

@@ -56,15 +56,15 @@ Running some basic aggregations while pulling data from BigQuery. Grouping by ta
5656

5757

5858
```python
59-
data_query = """SELECT
59+
data_query = """SELECT
6060
taxi_id,
6161
TIMESTAMP_TRUNC(trip_start_timestamp, DAY) as day,
6262
SUM(trip_miles) as total_miles_travelled,
6363
SUM(trip_seconds) as total_trip_seconds,
6464
SUM(fare) as total_earned,
6565
COUNT(*) as trip_count
66-
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
67-
WHERE
66+
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
67+
WHERE
6868
trip_miles > 0 AND trip_seconds > 60 AND
6969
trip_start_timestamp BETWEEN '2019-01-01' and '2020-12-31' AND
7070
trip_total < 1000
@@ -84,7 +84,7 @@ pyarrow.parquet.write_table(driver_stats_table, "trips_stats.parquet")
8484
def entities_query(year):
8585
return f"""SELECT
8686
distinct taxi_id
87-
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
87+
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
8888
WHERE
8989
trip_miles > 0 AND trip_seconds > 0 AND
9090
trip_start_timestamp BETWEEN '{year}-01-01' and '{year}-12-31'
@@ -120,7 +120,7 @@ from google.protobuf.duration_pb2 import Duration
120120

121121
```python
122122
batch_source = FileSource(
123-
event_timestamp_column="day",
123+
timestamp_field="day",
124124
path="trips_stats.parquet", # using parquet file that we created on previous step
125125
file_format=ParquetFormat()
126126
)
@@ -141,7 +141,7 @@ trips_stats_fv = FeatureView(
141141
Feature("total_trip_seconds", ValueType.DOUBLE),
142142
Feature("total_earned", ValueType.DOUBLE),
143143
Feature("trip_count", ValueType.INT64),
144-
144+
145145
],
146146
ttl=Duration(seconds=86400),
147147
batch_source=batch_source,
@@ -317,8 +317,8 @@ store.create_saved_dataset(
317317

318318
Dataset profiler is a function that accepts a dataset and generates a set of its characteristics. These characteristics will then be used to evaluate (validate) next datasets.
319319

320-
**Important: datasets are not compared to each other!
321-
Feast uses a reference dataset and a profiler function to generate a reference profile.
320+
**Important: datasets are not compared to each other!
321+
Feast uses a reference dataset and a profiler function to generate a reference profile.
322322
This profile will then be used during validation of the tested dataset.**
323323

324324

@@ -523,37 +523,37 @@ def stats_profiler(ds: PandasDataset) -> ExpectationSuite:
523523
max_value=60,
524524
mostly=0.99 # allow some outliers
525525
)
526-
526+
527527
ds.expect_column_values_to_be_between(
528528
"total_miles_travelled",
529529
min_value=0,
530530
max_value=500,
531531
mostly=0.99 # allow some outliers
532532
)
533-
533+
534534
# expectation of means based on observed values
535535
observed_mean = ds.trip_count.mean()
536536
ds.expect_column_mean_to_be_between("trip_count",
537537
min_value=observed_mean * (1 - DELTA),
538538
max_value=observed_mean * (1 + DELTA))
539-
539+
540540
observed_mean = ds.earned_per_hour.mean()
541541
ds.expect_column_mean_to_be_between("earned_per_hour",
542542
min_value=observed_mean * (1 - DELTA),
543543
max_value=observed_mean * (1 + DELTA))
544-
545-
544+
545+
546546
# expectation of quantiles
547547
qs = [0.5, 0.75, 0.9, 0.95]
548548
observed_quantiles = ds.avg_fare.quantile(qs)
549-
549+
550550
ds.expect_column_quantile_values_to_be_between(
551551
"avg_fare",
552552
quantile_ranges={
553553
"quantiles": qs,
554554
"value_ranges": [[None, max_value] for max_value in observed_quantiles]
555-
})
556-
555+
})
556+
557557
return ds.get_expectation_suite()
558558
```
559559

@@ -663,7 +663,7 @@ _ = job.to_df(validation_reference=validation_reference)
663663
Validation passed successfully, as no exceptions were raised.
664664

665665

666-
### 5. Validating new historical retrieval
666+
### 5. Validating new historical retrieval
667667

668668
Creating new timestamps for Dec 2020:
669669

examples/java-demo/feature_repo/driver_repo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
driver_hourly_stats = FileSource(
99
path="data/driver_stats_with_string.parquet",
10-
event_timestamp_column="event_timestamp",
10+
timestamp_field="event_timestamp",
1111
created_timestamp_column="created",
1212
)
1313
driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",)

java/serving/src/test/java/feast/serving/util/DataGenerator.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ public static FeatureTableSpec createFeatureTableSpec(
158158
.setMaxAge(Duration.newBuilder().setSeconds(3600).build())
159159
.setBatchSource(
160160
DataSource.newBuilder()
161-
.setEventTimestampColumn("ts")
161+
.setTimestampField("ts")
162162
.setType(DataSource.SourceType.BATCH_FILE)
163163
.setFileOptions(
164164
FileOptions.newBuilder()
@@ -204,7 +204,7 @@ public static DataSource createFileDataSourceSpec(
204204
.setType(DataSource.SourceType.BATCH_FILE)
205205
.setFileOptions(
206206
FileOptions.newBuilder().setFileFormat(createParquetFormat()).setUri(fileURL).build())
207-
.setEventTimestampColumn(timestampColumn)
207+
.setTimestampField(timestampColumn)
208208
.setDatePartitionColumn(datePartitionColumn)
209209
.build();
210210
}
@@ -215,7 +215,7 @@ public static DataSource createBigQueryDataSourceSpec(
215215
.setType(DataSource.SourceType.BATCH_BIGQUERY)
216216
.setBigqueryOptions(
217217
DataSource.BigQueryOptions.newBuilder().setTableRef(bigQueryTableRef).build())
218-
.setEventTimestampColumn(timestampColumn)
218+
.setTimestampField(timestampColumn)
219219
.setDatePartitionColumn(datePartitionColumn)
220220
.build();
221221
}
@@ -230,7 +230,7 @@ public static DataSource createKafkaDataSourceSpec(
230230
.setBootstrapServers(servers)
231231
.setMessageFormat(createProtoFormat("class.path"))
232232
.build())
233-
.setEventTimestampColumn(timestampColumn)
233+
.setTimestampField(timestampColumn)
234234
.build();
235235
}
236236

@@ -292,7 +292,7 @@ public static DataSource createKinesisDataSourceSpec(
292292
.setStreamName("stream")
293293
.setRecordFormat(createProtoFormat(classPath))
294294
.build())
295-
.setEventTimestampColumn(timestampColumn)
295+
.setTimestampField(timestampColumn)
296296
.build();
297297
}
298298

protos/feast/core/DataSource.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ message DataSource {
6666
map<string, string> field_mapping = 2;
6767

6868
// Must specify event timestamp column name
69-
string event_timestamp_column = 3;
69+
string timestamp_field = 3;
7070

7171
// (Optional) Specify partition column
7272
// useful for file sources

0 commit comments

Comments
 (0)