Skip to content

Commit 1655b79

Browse files
authored
Add proper template generation to Feast CLI (feast-dev#1460)
* Add templating to Feast init Signed-off-by: Willem Pienaar <git@willem.co> * Add GCP template Signed-off-by: Willem Pienaar <git@willem.co> * Fix Mypy warnings Signed-off-by: Willem Pienaar <git@willem.co> * Refactor template comments Signed-off-by: Willem Pienaar <git@willem.co> * Fix bug in gcp template Signed-off-by: Willem Pienaar <git@willem.co> * Fix template generation Signed-off-by: Willem Pienaar <git@willem.co>
1 parent 34b7d38 commit 1655b79

14 files changed

Lines changed: 282 additions & 64 deletions

File tree

docs/concepts/feature-repository.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ project: my_feature_repo_1
3838
registry: data/metadata.db
3939
provider: local
4040
online_store:
41-
local:
42-
path: data/online_store.db
41+
path: data/online_store.db
4342
```
4443
{% endcode %}
4544

sdk/python/feast/cli.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from feast.repo_operations import (
3434
apply_total,
3535
cli_check_repo,
36+
generate_project_name,
3637
init_repo,
3738
registry_dump,
3839
teardown,
@@ -454,10 +455,34 @@ def materialize_incremental_command(end_ts: str, repo_path: str, views: List[str
454455

455456

456457
@cli.command("init")
@click.argument("PROJECT_DIRECTORY", required=False)
@click.option(
    "--minimal", "-m", is_flag=True, help="Create an empty project repository"
)
@click.option(
    "--template",
    "-t",
    type=click.Choice(["local", "gcp"], case_sensitive=False),
    help="Specify a template for the created project (default: local)",
    # No click-level default: a default of "local" would make `template`
    # always truthy, so `--minimal` on its own would be rejected as a conflict.
    default=None,
)
def init_command(project_directory, minimal: bool, template: str):
    """Create a new Feast repository"""
    # NOTE: the docstring above is shown by click as the command help text,
    # so it is intentionally kept to a single line.
    if not project_directory:
        # No directory given: fall back to a generated project name.
        project_directory = generate_project_name()

    # `template` is only set here when the user passed --template explicitly,
    # which is the one case where it genuinely conflicts with --minimal.
    if template and minimal:
        from colorama import Fore, Style

        click.echo(
            f"Please select either a {Style.BRIGHT + Fore.GREEN}template{Style.RESET_ALL} or "
            f"{Style.BRIGHT + Fore.GREEN}minimal{Style.RESET_ALL}, not both"
        )
        raise SystemExit(1)

    if minimal:
        template = "minimal"
    elif not template:
        # Preserve the previous default behaviour of `feast init`.
        template = "local"

    init_repo(project_directory, template)
461486

462487

463488
if __name__ == "__main__":

sdk/python/feast/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,8 @@ def update_registry_proto(self, updater: Callable[[RegistryProto], RegistryProto
403403
def _write_registry(self, registry_proto: RegistryProto):
    """Stamp the registry proto and write it to ``self._filepath``.

    A fresh random ``version_id`` and the current UTC time are recorded on
    ``registry_proto`` before it is serialized, so the passed-in proto is
    mutated by this call.

    Args:
        registry_proto: The registry state to persist.
    """
    registry_proto.version_id = str(uuid.uuid4())
    registry_proto.last_updated.FromDatetime(datetime.utcnow())
    # parents=True so that a registry path nested under several directories
    # that do not exist yet does not fail with FileNotFoundError.
    self._filepath.parent.mkdir(parents=True, exist_ok=True)
    self._filepath.write_bytes(registry_proto.SerializeToString())
408410

sdk/python/feast/repo_operations.py

Lines changed: 58 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@
22
import os
33
import random
44
import sys
5-
from datetime import datetime, timedelta
5+
from datetime import timedelta
6+
from importlib.abc import Loader
67
from pathlib import Path
7-
from textwrap import dedent
88
from typing import List, NamedTuple, Union
99

1010
from feast import Entity, FeatureTable
11-
from feast.driver_test_data import create_driver_hourly_stats_df
1211
from feast.feature_view import FeatureView
1312
from feast.infra.provider import get_provider
1413
from feast.names import adjectives, animals
@@ -64,7 +63,9 @@ def apply_total(repo_config: RepoConfig, repo_path: Path):
6463
registry_path=registry_config.path,
6564
cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
6665
)
66+
sys.dont_write_bytecode = True
6767
repo = parse_repo(repo_path)
68+
sys.dont_write_bytecode = False
6869

6970
for entity in repo.entities:
7071
registry.apply_entity(entity, project=project)
@@ -159,69 +160,71 @@ def cli_check_repo(repo_path: Path):
159160
sys.exit(1)
160161

161162

162-
def init_repo(repo_path: Path, minimal: bool):
163-
repo_config = repo_path / "feature_store.yaml"
163+
def init_repo(repo_name: str, template: str):
    """Create a new feature repository from one of the packaged templates.

    The template folder is copied into ``<cwd>/<repo_name>``. If the copy
    contains a ``bootstrap.py``, its ``bootstrap()`` function is executed and
    the script is then deleted. Finally the ``project: my_project`` line of
    ``feature_store.yaml`` is rewritten to use ``repo_name``.

    Args:
        repo_name: Directory to create, relative to the current working
            directory. Also used as the project name in feature_store.yaml.
        template: Name of a folder inside this package's ``templates``
            directory.

    Raises:
        IOError: If the template folder does not exist.
        ImportError: If the template's bootstrap.py cannot be loaded.

    Exits the process with status 1 when the target directory already holds
    a ``feature_store.yaml``.
    """
    import importlib.util
    import shutil

    # NOTE(review): distutils is deprecated (PEP 632); shutil.copytree with
    # dirs_exist_ok=True is the replacement once Python >= 3.8 is the minimum.
    from distutils.dir_util import copy_tree

    import click
    from colorama import Fore, Style

    # Validate the template before touching the filesystem, so an unknown
    # template name does not leave an empty project directory behind.
    template_path = (Path(__file__).parent / "templates" / template).absolute()
    if not template_path.exists():
        raise IOError(f"Could not find template {template}")

    repo_path = Path.cwd() / repo_name
    repo_path.mkdir(parents=True, exist_ok=True)
    repo_config_path = repo_path / "feature_store.yaml"

    if repo_config_path.exists():
        new_directory = os.path.relpath(repo_path, os.getcwd())

        print(
            f"The directory {Style.BRIGHT + Fore.GREEN}{new_directory}{Style.RESET_ALL} contains an existing feature "
            f"store repository that may cause a conflict"
        )
        print()
        sys.exit(1)

    # Copy template directory
    copy_tree(str(template_path), str(repo_path))

    # Seed the repository
    bootstrap_path = repo_path / "bootstrap.py"
    if bootstrap_path.exists():
        spec = importlib.util.spec_from_file_location("bootstrap", str(bootstrap_path))
        # Explicit check instead of `assert`, which is stripped under -O.
        if spec is None or not isinstance(spec.loader, Loader):
            raise ImportError(f"Could not load bootstrap script {bootstrap_path}")
        bootstrap = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(bootstrap)
        bootstrap.bootstrap()  # type: ignore
        # The script is a one-off seeding step and is not left in the repo.
        os.remove(bootstrap_path)

    # Template the feature_store.yaml file
    replace_str_in_file(
        repo_config_path, "project: my_project", f"project: {repo_name}"
    )

    # Remove the __pycache__ folder if it exists
    shutil.rmtree(repo_path / "__pycache__", ignore_errors=True)

    click.echo()
    click.echo(
        f"Creating a new Feast repository in {Style.BRIGHT + Fore.GREEN}{repo_path}{Style.RESET_ALL}."
    )
    click.echo()

205-
with open(repo_path / "example.py", "wt") as f:
206-
f.write(example_py.replace("%PARQUET_PATH%", str(driver_stats_path)))
207-
208-
# Generate config
209-
repo_config.write_text(
210-
dedent(
211-
f"""
212-
project: {project_id}
213-
registry: {"data/registry.db"}
214-
provider: local
215-
online_store:
216-
path: {"data/online_store.db"}
217-
"""
218-
)
219-
)
220221

221-
print("Generated feature_store.yaml and example features in example_repo.py")
222-
print(
223-
"Now try running `feast apply` to apply and `feast materialize` to sync data to the online store"
224-
)
222+
def replace_str_in_file(file_path, match_str, sub_str):
    """Replace every occurrence of ``match_str`` in a text file, in place.

    Args:
        file_path: Path (str or path-like) of the file to rewrite.
        match_str: Literal text to search for (not a regular expression).
        sub_str: Text that replaces each occurrence of ``match_str``.

    The file is left with identical content when ``match_str`` is absent.
    """
    # Read and write explicitly as UTF-8 rather than the locale encoding:
    # the rewritten files are YAML/Python sources, which are expected to be
    # UTF-8 regardless of platform (e.g. non-ASCII project names or paths).
    with open(file_path, "r", encoding="utf-8") as f:
        contents = f.read()
    contents = contents.replace(match_str, sub_str)
    with open(file_path, "wt", encoding="utf-8") as f:
        f.write(contents)
225228

226229

227230
def generate_project_name() -> str:
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Feast Templates
2+
3+
Each folder in this module is a template that comes packaged with Feast.
4+
5+
* A template is installed with `feast init -t template_name`
6+
* The template name provided during `init` maps directly to the folder name
7+
* It is possible to provide a bootstrap.py script with a template. The script must provide a bootstrap() function. This
8+
script will automatically be executed and can be used to set up data or sources for the user.
9+
* The feature_store.yaml will have its `project` name templated based on the project name provided by the user. The
10+
default project name should be `my_project`. If a different name is chosen then no templating will occur.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from datetime import timedelta
2+
3+
from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType
4+
5+
# The driver entity. An entity plays the role of a primary key: it is what
# features are looked up by, and what multiple tables/views are joined on when
# feature vectors are constructed.
driver = Entity(
    # Entity name; has to be unique within a project
    name="driver",
    # Storage-level type of the entity
    value_type=ValueType.INT64,
    # Storage-level field/column that features are looked up on. The same key
    # is used to join feature tables/views when building feature vectors
    join_key="driver_id",
)

# A data source that feature values are read from. Sources are queried both
# when building training datasets and when materializing features into an
# online store.
driver_stats_source = BigQuerySource(
    # BigQuery table holding the feature data
    table_ref="feast-oss.demo_data.driver_stats",
    # (Optional) creation timestamp, used to ensure there are no duplicate
    # feature rows in the offline store or in training datasets
    created_timestamp_column="created",
    # Event timestamp, used for point-in-time joins and to make sure only
    # features within the TTL are returned
    event_timestamp_column="datetime",
)

# A feature view groups features according to how they are stored in the
# online or offline store.
driver_stats_fv = FeatureView(
    # Unique name of the feature view; no two feature views in one project
    # may share a name
    name="driver_stats",
    # Keys required to join or look up features from this view. Each item
    # refers to the name of a defined entity
    entities=["driver"],
    # Schema of the view: these features are what gets materialized into a
    # store and what can be referenced when building a training dataset or
    # serving features
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    # Maximum age a feature value may have relative to its lookup time. For
    # historical features (training) the TTL is relative to each timestamp in
    # the entity dataframe. The TTL also allows keys to be evicted from online
    # stores and bounds how much history has to be scanned during retrieval
    ttl=timedelta(weeks=52),
    # Where feature values come from; here a BigQuery table with driver
    # statistics
    input=driver_stats_source,
    # Free-form key/value pairs attached to the feature view
    tags={"team": "driver_performance"},
)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
project: my_project
2+
registry: data/registry.db
3+
provider: gcp
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
from datetime import datetime, timedelta
2+
3+
import pandas as pd
4+
from driver_repo import driver, driver_stats_fv
5+
6+
from feast import FeatureStore
7+
8+
9+
def main():
    """Run an end-to-end demo against the feature store in the current directory.

    Steps, each reported on stdout:
      1. Apply the ``driver`` entity and ``driver_stats_fv`` feature view.
      2. Retrieve historical features for a small sample entity dataframe.
      3. Materialize features into the online store up to the current time.
      4. Read the same features back from the online store.
    """
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to GCP
    print("Deploying feature store to GCP...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    feature_refs = ["driver_stats:conv_rate", "driver_stats:acc_rate"]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    entity_df = pd.DataFrame(
        {
            "event_timestamp": [
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
                for dt in pd.date_range(
                    start=datetime.now() - timedelta(days=3),
                    end=datetime.now(),
                    periods=3,
                )
            ],
            "driver_id": [5001, 5002, 5003],
        }
    )

    print("Retrieving training data...")

    # Retrieve historical features by joining the entity dataframe to the BigQuery table source
    training_df = fs.get_historical_features(
        feature_refs=feature_refs, entity_df=entity_df
    ).to_df()

    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Retrieve features from the online store (Firestore)
    online_features = fs.get_online_features(
        feature_refs=feature_refs, entity_rows=[{"driver": 5001}, {"driver": 5002}]
    ).to_dict()

    print()
    print(pd.DataFrame.from_dict(online_features))


# The module name of a directly executed script is "__main__"; the previous
# comparison against "__main__()" never matched, so main() was never called.
if __name__ == "__main__":
    main()
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
def bootstrap():
    """Seed the repository containing this script with sample driver data.

    Called automatically by init_repo() during `feast init`. Writes 15 days of
    hourly driver statistics to ``data/driver_stats.parquet`` next to this
    file and substitutes that path for the ``%PARQUET_PATH%`` placeholder in
    ``example.py``.
    """
    import pathlib
    from datetime import datetime, timedelta

    from feast.driver_test_data import create_driver_hourly_stats_df

    repo_root = pathlib.Path(__file__).parent.absolute()
    data_dir = repo_root / "data"
    data_dir.mkdir(exist_ok=True)

    # Time window: the 15 days ending at the start of the current hour.
    window_end = datetime.now().replace(microsecond=0, second=0, minute=0)
    window_start = window_end - timedelta(days=15)

    stats_df = create_driver_hourly_stats_df(
        [1001, 1002, 1003, 1004, 1005], window_start, window_end
    )

    parquet_path = data_dir / "driver_stats.parquet"
    stats_df.to_parquet(path=str(parquet_path), allow_truncated_timestamps=True)

    # Point the example feature definitions at the generated parquet file.
    replace_str_in_file(repo_root / "example.py", "%PARQUET_PATH%", str(parquet_path))
24+
25+
26+
def replace_str_in_file(file_path, match_str, sub_str):
    """Replace every occurrence of ``match_str`` in a text file, in place.

    Args:
        file_path: Path (str or path-like) of the file to rewrite.
        match_str: Literal text to search for (not a regular expression).
        sub_str: Text that replaces each occurrence of ``match_str``.

    The file is left with identical content when ``match_str`` is absent.
    """
    # Read and write explicitly as UTF-8 rather than the locale encoding:
    # the rewritten file is Python source, which is expected to be UTF-8
    # regardless of platform (e.g. a data path with non-ASCII characters).
    with open(file_path, "r", encoding="utf-8") as f:
        contents = f.read()
    contents = contents.replace(match_str, sub_str)
    with open(file_path, "wt", encoding="utf-8") as f:
        f.write(contents)


if __name__ == "__main__":
    bootstrap()
File renamed without changes.

0 commit comments

Comments
 (0)