diff --git a/docs/how-to-guides/feathr-configuration-and-env.md b/docs/how-to-guides/feathr-configuration-and-env.md index 381cd6d41..bfbb5b3b8 100644 --- a/docs/how-to-guides/feathr-configuration-and-env.md +++ b/docs/how-to-guides/feathr-configuration-and-env.md @@ -76,9 +76,10 @@ Feathr will get the configurations in the following order: | ONLINE_STORE__REDIS__PORT | Redis port number to access Redis cluster. | Required if using Redis as online store. | | ONLINE_STORE__REDIS__SSL_ENABLED | Whether SSL is enabled to access Redis cluster. | Required if using Redis as online store. | | REDIS_PASSWORD | Password for the Redis cluster. | Required if using Redis as online store. | -| FEATURE_REGISTRY__PURVIEW__PURVIEW_NAME | Configure the name of the purview endpoint. | Required if using Purview as the endpoint. | -| FEATURE_REGISTRY__PURVIEW__DELIMITER | See [here](#FEATURE_REGISTRY__PURVIEW__DELIMITER) for more details. | Required | -| FEATURE_REGISTRY__PURVIEW__TYPE_SYSTEM_INITIALIZATION | Controls whether the type system (think this as the "schema" for the registry) will be initialized or not. Usually this is only required to be set to `True` to initialize schema, and then you can set it to `False` to shorten the initialization time. | Required | +| FEATURE_REGISTRY__API_ENDPOINT | Specifies registry endpoint. | Required if using registry service. | +| FEATURE_REGISTRY__PURVIEW__PURVIEW_NAME | Configure the name of the purview endpoint. | Required if using Purview directly without registry service. Deprecate soon, see [here](#deprecation) for more details.| +| FEATURE_REGISTRY__PURVIEW__DELIMITER | See [here](#FEATURE_REGISTRY__PURVIEW__DELIMITER) for more details. | Required if using Purview directly without registry service. Deprecate soon, see [here](#deprecation) for more details.| +| FEATURE_REGISTRY__PURVIEW__TYPE_SYSTEM_INITIALIZATION | Controls whether the type system (think this as the "schema" for the registry) will be initialized or not. Usually this is only required to be set to `True` to initialize schema, and then you can set it to `False` to shorten the initialization time. | Required if using Purview directly without registry service. Deprecate soon, see [here](#deprecation) for more details.| # Explanation for selected configurations @@ -121,6 +122,11 @@ Another use case is to use `instance_pool_id`, where instead of creating the Spa Other advanced settings includes `idempotency_token` to guarantee the idempotency of job run requests, etc. +## Deprecation + +This section is about feature deprecation in Feathr. Deprecated features are still available but planned for removal. Please migrate to the supported features as early as possible. +- Connect to Purview directly without registry service is deprecated soon. Please migrate to use registry service and select Purview as registry provider. + ## FEATURE_REGISTRY__PURVIEW__DELIMITER Delimiter indicates that how the project name, feature names etc. are delimited. By default it will be '__'. this is for global reference (mainly for feature sharing). For example, when we setup a project called foo, and we have an anchor called 'taxi_driver' and the feature name is called 'f_daily_trips'. the feature will have a global unique name called 'foo__taxi_driver__f_daily_trips' diff --git a/docs/samples/fraud_detection_demo.ipynb b/docs/samples/fraud_detection_demo.ipynb index 1408de700..412141d30 100644 --- a/docs/samples/fraud_detection_demo.ipynb +++ b/docs/samples/fraud_detection_demo.ipynb @@ -184,12 +184,10 @@ "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "os.environ['online_store__redis__host'] = redis_host\n", "os.environ['online_store__redis__port'] = redis_port\n", "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", "os.environ['REDIS_PASSWORD']=redis_password\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" ] }, @@ -998,9 +996,6 @@ "notebookOrigID": 1891349682974490, "widgets": {} }, - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - }, "kernelspec": { "display_name": "Python 3.10.4 64-bit", "language": "python", @@ -1018,8 +1013,13 @@ "pygments_lexer": "ipython3", "version": "3.10.4" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6eea572ac5b43246b7c51fa33510c93fb6df4c34b515a6e4994c858f44841967" + } + } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/docs/samples/product_recommendation_demo.ipynb b/docs/samples/product_recommendation_demo.ipynb index ed81644b8..d5834a459 100644 --- a/docs/samples/product_recommendation_demo.ipynb +++ b/docs/samples/product_recommendation_demo.ipynb @@ -184,12 +184,10 @@ "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "os.environ['online_store__redis__host'] = redis_host\n", "os.environ['online_store__redis__port'] = redis_port\n", "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", "os.environ['REDIS_PASSWORD']=redis_password\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" ] }, diff --git a/docs/samples/product_recommendation_demo_advanced.ipynb b/docs/samples/product_recommendation_demo_advanced.ipynb index cb018598c..f52905a4d 100644 --- a/docs/samples/product_recommendation_demo_advanced.ipynb +++ b/docs/samples/product_recommendation_demo_advanced.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e5545a38-44a7-4aca-be6d-a66c51c75ec8","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Azure Demo Notebook\n","\n","This notebook illustrates the use of Feathr Feature Store to create a model that predict users' rating for different products for a e-commerce website.\n","\n","## Model Problem Statement\n","The e-commerce website has collected past user ratings for various products. The websie also collected data about user and product, like user age, product category etc. Now we want to predict users' product rating for new product so that we can recommend the new product to users that give a high rating for those products.\n","\n","After the model is trained, given a user_id, product_id pair and features, we should be able to predict the product rating that the user will give for this product_id.\n","\n","(Compared with [the beginner version of product recommendation](product_recommendation_demo.ipynb), this tutorial expanded the example by predicting ratings for all products.)\n","\n","## Feature Creation Illustration\n","In this example, our observation data has compound entity key where a record is uniquely identified by user_id and product_id. So there might be 3 types of features:\n","* User features that are different for different users but are the same for different products. For example, user age is different for different users but it's the same for all products(or it's product-agnostic).\n","* Product features that are different for different products but are the same for different users.\n","* User-to-product features that are different for different users AND different products. For example, a feature to represent if the user has bought this product before or not.\n","\n","We will focus on the first two in our example.\n","\n","The feature creation flow is as below:\n","![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/product_recommendation_advanced.jpg?raw=true)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"52b7d651-19d4-44b0-a7a8-03549f49e524","showTitle":false,"title":""}},"source":["## Prerequisite: Use Quick Start Template to Provision Azure Resources\n","\n","First step is to provision required cloud resources if you want to use Feathr. Feathr provides a python based client to interact with cloud resources.\n","\n","Please follow the steps [here](https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html) to provision required cloud resources. Due to the complexity of the possible cloud environment, it is almost impossible to create a script that works for all the use cases. Because of this, [azure_resource_provision.sh](https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure_resource_provision.sh) is a full end to end command line to create all the required resources, and you can tailor the script as needed, while [the companion documentation](https://linkedin.github.io/feathr/how-to-guides/azure-deployment-cli.html) can be used as a complete guide for using that shell script. \n","\n","\n","![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1ec709d2-62ef-48c7-b915-9790afdac589","showTitle":false,"title":""}},"source":["## Prerequisite: Install Feathr \n","\n","Install Feathr using pip:\n","\n","`pip install -U feathr pandavro scikit-learn`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab5d219b-b827-4f25-9918-d7cb7b47938e","showTitle":false,"title":""}},"source":["## Prerequisite: Configure the required environment with Feathr Quick Start Template\n","\n","In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"146a1443-ce8b-4b8e-8169-2417af8bcb62","showTitle":false,"title":""}},"source":["**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"99b2d855-dae1-4ac8-8492-406dad242326","showTitle":false,"title":""}},"outputs":[],"source":["resource_prefix = \"ckim2\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"95ad2a97-b8e7-4189-8463-51fe419d29c5","showTitle":false,"title":""}},"outputs":[],"source":["! pip install feathr azure-cli pandavro scikit-learn\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a4d7dc6a-d753-4fb6-9683-2766f9a046c7","showTitle":false,"title":""}},"source":["Login to Azure with a device code (You will see instructions in the output):"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"42cf1691-b8de-48d2-b174-0c269950d470","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0f3135eb-15c5-4f46-90ff-881a21cc59df","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","import pandas as pd\n","import pandavro as pdx\n","from feathr import FeathrClient\n","from feathr import BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.metrics import mean_squared_error\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a58b69e8-fbd2-48dd-81cb-85163dfbb676","showTitle":false,"title":""}},"source":["**Permission**\n","\n","To proceed with the following steps, you may need additional permission: permission to access the keyvault, permission to access the Storage Blob as a Contributor and permission to submit jobs to Synapse cluster. Skip this step if you have already given yourself the access. Otherwise, run the following lines of command in the Cloud Shell before running the cell below.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","synapse_workspace_name=\"${resource_prefix}syws\"\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"510120a8-d456-4aa1-9b0b-6e10bd774b78","showTitle":false,"title":""}},"source":["**Get all the required credentials from Azure KeyVault**"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b589fc31-11f9-4bea-963a-9dab88cd6689","showTitle":true,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4a1f37e9-eb40-4791-9904-19e13a98f5c9","showTitle":false,"title":""}},"source":["## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n","\n","In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n","\n","The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c7cd2bc7-237c-4170-a9b7-ae94f279bbba","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'feathr_getting_started'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/feathr_getting_started'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91548af7-5d87-4743-9db4-8fac7ba67804","showTitle":false,"title":""}},"source":["## Setup necessary environment variables (Skip if using the above Quick Start Template)\n","\n","You should setup the environment variables in order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n","\n","To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n","To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"794492ed-66b0-4787-adc6-3f234c4739a9","showTitle":false,"title":""}},"source":["# Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c748f9d-210b-4c1d-a414-b30328d5e219","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46b45998-d933-4417-b152-7db091c0d5bd","showTitle":false,"title":""}},"source":["## Explore the raw source data\n","We have 4 datasets to work with: one observation dataset(a.k.a. label dataset), two raw datasets to generate features for users, one raw datasets to generate features for product."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"591b1801-5783-4d88-b7b7-ff3bbcfa0a9e","showTitle":false,"title":""}},"outputs":[],"source":["# Observation dataset(a.k.a. label dataset)\n","# Observation dataset usually comes with a event_timestamp to denote when the observation happened.\n","# The label here is product_rating. Our model objective is to predict a user's rating for this product.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_observation_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"11b8a74f-c0e1-4556-9a97-f17f8a90a795","showTitle":false,"title":""}},"outputs":[],"source":["# User profile dataset\n","# Used to generate user features\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_profile_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"12f237da-a7fb-48c2-985e-a8cdfa3bb3fc","showTitle":false,"title":""}},"outputs":[],"source":["# User purchase history dataset.\n","# Used to generate user features. This is activity type data, so we need to use aggregation to genearte features.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"333ef001-50c8-4556-b484-78715b657dbb","showTitle":false,"title":""}},"outputs":[],"source":["# Product detail dataset.\n","# Used to generate product features.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bdc5a2e1-ccd4-4d61-9168-b0e4f571587b","showTitle":false,"title":""}},"source":["## Defining Features with Feathr\n","Let's try to create features from those raw source data.\n","In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n","\n","\n","1. The typed key (a.k.a. entity key) identifies the subject of feature, e.g. a user id, 123.\n","2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","4. The timestamp indicates when the event happened. For example, the user purchased certain product on a certain timestamp. This is usually used for point-in-time join."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"30e2c57d-6487-4d72-bd78-80d17325f1a9","showTitle":false,"title":""}},"source":["Note: in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n","It is merely a function/transformation executing against request data at runtime.\n","For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n","(We won't cover this in the tutorial.)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"64fc4ef8-ccde-4724-8eff-1263c08de39f","showTitle":false,"title":""}},"source":["### Define Sources Section with UDFs\n","\n","#### Define Anchors and Features\n","A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature.\n","\n","#### Feature source\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c32249b5-599b-4337-bebf-c33693354685","showTitle":false,"title":""}},"outputs":[],"source":["from pyspark.sql import SparkSession, DataFrame\n","def feathr_udf_preprocessing(df: DataFrame) -> DataFrame:\n"," from pyspark.sql.functions import col\n"," df = df.withColumn(\"tax_rate_decimal\", col(\"tax_rate\")/100)\n"," df.show(10)\n"," return df\n","\n","batch_source = HdfsSource(name=\"userProfileData\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_profile_mock_data.csv\",\n"," preprocessing=feathr_udf_preprocessing)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2961afe9-4bdc-48ba-a63f-229081f557a3","showTitle":false,"title":""}},"outputs":[],"source":["# Let's define some features for users so our recommendation can be customized for users.\n","user_id = TypedKey(key_column=\"user_id\",\n"," key_column_type=ValueType.INT32,\n"," description=\"user id\",\n"," full_name=\"product_recommendation.user_id\")\n","\n","feature_user_age = Feature(name=\"feature_user_age\",\n"," key=user_id,\n"," feature_type=INT32, transform=\"age\")\n","feature_user_tax_rate = Feature(name=\"feature_user_tax_rate\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=\"tax_rate_decimal\")\n","feature_user_gift_card_balance = Feature(name=\"feature_user_gift_card_balance\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=\"gift_card_balance\")\n","feature_user_has_valid_credit_card = Feature(name=\"feature_user_has_valid_credit_card\",\n"," key=user_id,\n"," feature_type=BOOLEAN,\n"," transform=\"number_of_credit_cards > 0\")\n"," \n","features = [\n"," feature_user_age,\n"," feature_user_tax_rate,\n"," feature_user_gift_card_balance,\n"," feature_user_has_valid_credit_card\n","]\n","\n","user_feature_anchor = FeatureAnchor(name=\"anchored_features\",\n"," source=batch_source,\n"," features=features)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4da453e8-a8fd-40b8-a1e6-2a0e7cac3f6e","showTitle":false,"title":""}},"outputs":[],"source":["# Let's define some features for the products so our recommendation can be customized for proudcts.\n","product_batch_source = HdfsSource(name=\"productProfileData\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")\n","\n","product_id = TypedKey(key_column=\"product_id\",\n"," key_column_type=ValueType.INT32,\n"," description=\"product id\",\n"," full_name=\"product_recommendation.product_id\")\n","\n","feature_product_quantity = Feature(name=\"feature_product_quantity\",\n"," key=product_id,\n"," feature_type=FLOAT, \n"," transform=\"quantity\")\n","feature_product_price = Feature(name=\"feature_product_price\",\n"," key=product_id,\n"," feature_type=FLOAT,\n"," transform=\"price\")\n"," \n","product_features = [\n"," feature_product_quantity,\n"," feature_product_price\n","]\n","\n","product_anchor = FeatureAnchor(name=\"product_anchored_features\",\n"," source=product_batch_source,\n"," features=product_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"78e240b4-dcab-499f-b6ed-72a14bfab968","showTitle":false,"title":""}},"source":["### Window aggregation features\n","\n","For window aggregation features, see the supported fields below:\n","\n","Note that the `agg_func` should be any of these:\n","\n","| Aggregation Type | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |\n","\n","\n","After you have defined features and sources, bring them together to build an anchor:\n","\n","\n","Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b62a9041-73dc-45e1-add5-8fe01ebf355f","showTitle":false,"title":""}},"outputs":[],"source":["purchase_history_data = HdfsSource(name=\"purchase_history_data\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\",\n"," event_timestamp_column=\"purchase_date\",\n"," timestamp_format=\"yyyy-MM-dd\")\n"," \n","agg_features = [Feature(name=\"feature_user_totla_purchase_in_90days\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(purchase_amount)\",\n"," agg_func=\"AVG\",\n"," window=\"90d\"))\n"," ]\n","\n","user_agg_feature_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n"," source=purchase_history_data,\n"," features=agg_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a04373b5-8ab9-4c36-892f-6aa8129df999","showTitle":false,"title":""}},"source":["### Derived Features Section\n","Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"688a4562-d8e9-468a-a900-77e750a3c903","showTitle":false,"title":""}},"outputs":[],"source":["feature_user_purchasing_power = DerivedFeature(name=\"feature_user_purchasing_power\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," input_features=[\n"," feature_user_gift_card_balance, feature_user_has_valid_credit_card],\n"," transform=\"feature_user_gift_card_balance + if_else(toBoolean(feature_user_has_valid_credit_card), 100, 0)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f4d8f829-bfbc-4d6f-bc32-3a419a32e3d3","showTitle":false,"title":""}},"source":["And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c617bb8-2605-4d40-acc9-2156c86dfc56","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[user_agg_feature_anchor, user_feature_anchor, product_anchor], derived_feature_list=[\n"," feature_user_purchasing_power])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6b2877d0-2ab8-4c07-99d4-effc7336ee8a","showTitle":false,"title":""}},"source":["## Create training data using point-in-time correct feature join\n","\n","A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"30302a53-561f-4b85-ba25-8de9fc843c63","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrazure_test.avro'\n","else:\n","# output_path = 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/test123_temp/product_rec_new'\n"," output_path = feathr_output_path\n","\n","\n","user_feature_query = FeatureQuery(\n"," feature_list=[\"feature_user_age\", \n"," \"feature_user_tax_rate\", \n"," \"feature_user_gift_card_balance\", \n"," \"feature_user_has_valid_credit_card\", \n"," \"feature_user_totla_purchase_in_90days\",\n"," \"feature_user_purchasing_power\"\n"," ], \n"," key=user_id)\n","\n","product_feature_query = FeatureQuery(\n"," feature_list=[\n"," \"feature_product_quantity\",\n"," \"feature_product_price\"\n"," ], \n"," key=product_id)\n","\n","settings = ObservationSettings(\n"," observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_observation_mock_data.csv\",\n"," event_timestamp_column=\"event_timestamp\",\n"," timestamp_format=\"yyyy-MM-dd\")\n","client.get_offline_features(observation_settings=settings,\n"," feature_query=[user_feature_query, product_feature_query],\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cc7b6276-70c1-494f-83ca-53d442e3198a","showTitle":false,"title":""}},"source":["## Download the training dataset and show the result\n","\n","Let's use the helper function `get_result_df` to download the result and view it:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"120c9a21-1e1d-4ef5-8fe9-00d35a93cbf1","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)\n","\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"497d6a3b-94e2-4087-94b1-0a5d7baf3ab3","showTitle":false,"title":""}},"source":["## Train a machine learning model\n","After getting all the features, let's train a machine learning model with the converted feature by Feathr:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9bd661ae-430e-449b-9a62-9155828de099","showTitle":false,"title":""}},"outputs":[],"source":["from sklearn.ensemble import GradientBoostingRegressor\n","final_df = df_res\n","\n","final_df.drop([\"event_timestamp\"], axis=1, inplace=True, errors='ignore')\n","final_df.fillna(0, inplace=True)\n","final_df['product_rating'] = final_df['product_rating'].astype(\"float64\")\n","\n","train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"product_rating\"], axis=1),\n"," final_df[\"product_rating\"],\n"," test_size=0.2,\n"," random_state=42)\n","model = GradientBoostingRegressor()\n","model.fit(train_x, train_y)\n","\n","y_predict = model.predict(test_x)\n","\n","y_actual = test_y.values.flatten().tolist()\n","rmse = sqrt(mean_squared_error(y_actual, y_predict))\n","\n","sum_actuals = sum_errors = 0\n","\n","for actual_val, predict_val in zip(y_actual, y_predict):\n"," abs_error = actual_val - predict_val\n"," if abs_error < 0:\n"," abs_error = abs_error * -1\n","\n"," sum_errors = sum_errors + abs_error\n"," sum_actuals = sum_actuals + actual_val\n","\n","mean_abs_percent_error = sum_errors / sum_actuals\n","print(\"Model MAPE:\")\n","print(mean_abs_percent_error)\n","print()\n","print(\"Model Accuracy:\")\n","print(1 - mean_abs_percent_error)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fda62a21-e7d6-4044-879f-bc05f77d248e","showTitle":false,"title":""}},"source":["## Materialize feature value into offline/online storage\n","\n","While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n","and materialize the feature value to offline and/or online storage. \n","\n","We can push the generated features to the online store like below:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3375f18d-cb64-4f13-8789-07b9d9c5835e","showTitle":false,"title":""}},"outputs":[],"source":["# Materialize user features\n","# (You can only materialize features of same entity key into one table so we can only materialize user features first.)\n","backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"user_features\")\n","settings = MaterializationSettings(\"user_feature_setting\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"feature_user_age\", \"feature_user_gift_card_balance\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fb61ed8-6db4-461c-bd86-a5ff268a7c3d","showTitle":false,"title":""}},"source":["We can then get the features from the online store (Redis):"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ed5da7df-8095-403e-91a6-c5d2104eaf68","showTitle":false,"title":""}},"source":["## Fetching feature value for online inference\n","\n","For features that are already materialized by the previous step, their latest value can be queried via the client's\n","`get_online_features` or `multi_get_online_features` API."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9d8f3710-d2d4-463a-b452-99bd56bb3482","showTitle":false,"title":""}},"outputs":[],"source":["client.get_online_features('user_features', '2', [\n"," 'feature_user_age', 'feature_user_gift_card_balance'])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e8aa6e5f-5b2d-4778-bafa-5a3a45fdd3b5","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('user_features', ['1', '2'], [\n"," 'feature_user_age', 'feature_user_gift_card_balance'])\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b19b73c6-7b0e-4b22-8eb1-8afdc328df74","showTitle":false,"title":""}},"source":["## Materialize product features\n","\n","We can also materialize product features into a separate table."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7a28cc6f-06f7-4915-9f3e-0a057467b77b","showTitle":false,"title":""}},"outputs":[],"source":["# Materialize product features\n","backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"product_features\")\n","settings = MaterializationSettings(\"product_feature_setting\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"feature_product_price\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8732aad1-7b22-4efc-8e2c-722030ae8bfb","showTitle":false,"title":""}},"outputs":[],"source":["client.get_online_features('product_feature_setting', '2', [\n"," 'feature_product_price'])\n","\n","client.get_online_features('product_features', '2', [\n"," 'feature_product_price'])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"acd29f4d-715b-4889-954d-b648ea8e2a0f","showTitle":false,"title":""}},"source":["### Registering and Fetching features\n","\n","We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1255ed12-5030-43b6-b733-5a467874b708","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"feathr_getting_started\")"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"product_recommendation_demo_advanced","notebookOrigID":411375353096492,"widgets":{}},"interpreter":{"hash":"b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985"},"kernelspec":{"display_name":"Python 3.9.12 ('product_env': venv)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e5545a38-44a7-4aca-be6d-a66c51c75ec8","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Azure Demo Notebook\n","\n","This notebook illustrates the use of Feathr Feature Store to create a model that predict users' rating for different products for a e-commerce website.\n","\n","## Model Problem Statement\n","The e-commerce website has collected past user ratings for various products. The websie also collected data about user and product, like user age, product category etc. Now we want to predict users' product rating for new product so that we can recommend the new product to users that give a high rating for those products.\n","\n","After the model is trained, given a user_id, product_id pair and features, we should be able to predict the product rating that the user will give for this product_id.\n","\n","(Compared with [the beginner version of product recommendation](product_recommendation_demo.ipynb), this tutorial expanded the example by predicting ratings for all products.)\n","\n","## Feature Creation Illustration\n","In this example, our observation data has compound entity key where a record is uniquely identified by user_id and product_id. So there might be 3 types of features:\n","* User features that are different for different users but are the same for different products. For example, user age is different for different users but it's the same for all products(or it's product-agnostic).\n","* Product features that are different for different products but are the same for different users.\n","* User-to-product features that are different for different users AND different products. For example, a feature to represent if the user has bought this product before or not.\n","\n","We will focus on the first two in our example.\n","\n","The feature creation flow is as below:\n","![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/product_recommendation_advanced.jpg?raw=true)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"52b7d651-19d4-44b0-a7a8-03549f49e524","showTitle":false,"title":""}},"source":["## Prerequisite: Use Quick Start Template to Provision Azure Resources\n","\n","First step is to provision required cloud resources if you want to use Feathr. Feathr provides a python based client to interact with cloud resources.\n","\n","Please follow the steps [here](https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html) to provision required cloud resources. Due to the complexity of the possible cloud environment, it is almost impossible to create a script that works for all the use cases. Because of this, [azure_resource_provision.sh](https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure_resource_provision.sh) is a full end to end command line to create all the required resources, and you can tailor the script as needed, while [the companion documentation](https://linkedin.github.io/feathr/how-to-guides/azure-deployment-cli.html) can be used as a complete guide for using that shell script. \n","\n","\n","![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1ec709d2-62ef-48c7-b915-9790afdac589","showTitle":false,"title":""}},"source":["## Prerequisite: Install Feathr \n","\n","Install Feathr using pip:\n","\n","`pip install -U feathr pandavro scikit-learn`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab5d219b-b827-4f25-9918-d7cb7b47938e","showTitle":false,"title":""}},"source":["## Prerequisite: Configure the required environment with Feathr Quick Start Template\n","\n","In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"146a1443-ce8b-4b8e-8169-2417af8bcb62","showTitle":false,"title":""}},"source":["**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"99b2d855-dae1-4ac8-8492-406dad242326","showTitle":false,"title":""}},"outputs":[],"source":["resource_prefix = \"ckim2\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"95ad2a97-b8e7-4189-8463-51fe419d29c5","showTitle":false,"title":""}},"outputs":[],"source":["! pip install feathr azure-cli pandavro scikit-learn\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a4d7dc6a-d753-4fb6-9683-2766f9a046c7","showTitle":false,"title":""}},"source":["Login to Azure with a device code (You will see instructions in the output):"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"42cf1691-b8de-48d2-b174-0c269950d470","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0f3135eb-15c5-4f46-90ff-881a21cc59df","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","import pandas as pd\n","import pandavro as pdx\n","from feathr import FeathrClient\n","from feathr import BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.metrics import mean_squared_error\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a58b69e8-fbd2-48dd-81cb-85163dfbb676","showTitle":false,"title":""}},"source":["**Permission**\n","\n","To proceed with the following steps, you may need additional permission: permission to access the keyvault, permission to access the Storage Blob as a Contributor and permission to submit jobs to Synapse cluster. Skip this step if you have already given yourself the access. Otherwise, run the following lines of command in the Cloud Shell before running the cell below.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","synapse_workspace_name=\"${resource_prefix}syws\"\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"510120a8-d456-4aa1-9b0b-6e10bd774b78","showTitle":false,"title":""}},"source":["**Get all the required credentials from Azure KeyVault**"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b589fc31-11f9-4bea-963a-9dab88cd6689","showTitle":true,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4a1f37e9-eb40-4791-9904-19e13a98f5c9","showTitle":false,"title":""}},"source":["## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n","\n","In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n","\n","The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c7cd2bc7-237c-4170-a9b7-ae94f279bbba","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'feathr_getting_started'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/feathr_getting_started'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91548af7-5d87-4743-9db4-8fac7ba67804","showTitle":false,"title":""}},"source":["## Setup necessary environment variables (Skip if using the above Quick Start Template)\n","\n","You should setup the environment variables in order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n","\n","To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n","To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"794492ed-66b0-4787-adc6-3f234c4739a9","showTitle":false,"title":""}},"source":["# Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c748f9d-210b-4c1d-a414-b30328d5e219","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46b45998-d933-4417-b152-7db091c0d5bd","showTitle":false,"title":""}},"source":["## Explore the raw source data\n","We have 4 datasets to work with: one observation dataset(a.k.a. label dataset), two raw datasets to generate features for users, one raw datasets to generate features for product."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"591b1801-5783-4d88-b7b7-ff3bbcfa0a9e","showTitle":false,"title":""}},"outputs":[],"source":["# Observation dataset(a.k.a. label dataset)\n","# Observation dataset usually comes with a event_timestamp to denote when the observation happened.\n","# The label here is product_rating. Our model objective is to predict a user's rating for this product.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_observation_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"11b8a74f-c0e1-4556-9a97-f17f8a90a795","showTitle":false,"title":""}},"outputs":[],"source":["# User profile dataset\n","# Used to generate user features\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_profile_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"12f237da-a7fb-48c2-985e-a8cdfa3bb3fc","showTitle":false,"title":""}},"outputs":[],"source":["# User purchase history dataset.\n","# Used to generate user features. This is activity type data, so we need to use aggregation to genearte features.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"333ef001-50c8-4556-b484-78715b657dbb","showTitle":false,"title":""}},"outputs":[],"source":["# Product detail dataset.\n","# Used to generate product features.\n","import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bdc5a2e1-ccd4-4d61-9168-b0e4f571587b","showTitle":false,"title":""}},"source":["## Defining Features with Feathr\n","Let's try to create features from those raw source data.\n","In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n","\n","\n","1. The typed key (a.k.a. entity key) identifies the subject of feature, e.g. a user id, 123.\n","2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","4. The timestamp indicates when the event happened. For example, the user purchased certain product on a certain timestamp. This is usually used for point-in-time join."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"30e2c57d-6487-4d72-bd78-80d17325f1a9","showTitle":false,"title":""}},"source":["Note: in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n","It is merely a function/transformation executing against request data at runtime.\n","For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n","(We won't cover this in the tutorial.)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"64fc4ef8-ccde-4724-8eff-1263c08de39f","showTitle":false,"title":""}},"source":["### Define Sources Section with UDFs\n","\n","#### Define Anchors and Features\n","A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature.\n","\n","#### Feature source\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c32249b5-599b-4337-bebf-c33693354685","showTitle":false,"title":""}},"outputs":[],"source":["from pyspark.sql import SparkSession, DataFrame\n","def feathr_udf_preprocessing(df: DataFrame) -> DataFrame:\n"," from pyspark.sql.functions import col\n"," df = df.withColumn(\"tax_rate_decimal\", col(\"tax_rate\")/100)\n"," df.show(10)\n"," return df\n","\n","batch_source = HdfsSource(name=\"userProfileData\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_profile_mock_data.csv\",\n"," preprocessing=feathr_udf_preprocessing)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2961afe9-4bdc-48ba-a63f-229081f557a3","showTitle":false,"title":""}},"outputs":[],"source":["# Let's define some features for users so our recommendation can be customized for users.\n","user_id = TypedKey(key_column=\"user_id\",\n"," key_column_type=ValueType.INT32,\n"," description=\"user id\",\n"," full_name=\"product_recommendation.user_id\")\n","\n","feature_user_age = Feature(name=\"feature_user_age\",\n"," key=user_id,\n"," feature_type=INT32, transform=\"age\")\n","feature_user_tax_rate = Feature(name=\"feature_user_tax_rate\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=\"tax_rate_decimal\")\n","feature_user_gift_card_balance = Feature(name=\"feature_user_gift_card_balance\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=\"gift_card_balance\")\n","feature_user_has_valid_credit_card = Feature(name=\"feature_user_has_valid_credit_card\",\n"," key=user_id,\n"," feature_type=BOOLEAN,\n"," transform=\"number_of_credit_cards > 0\")\n"," \n","features = [\n"," feature_user_age,\n"," feature_user_tax_rate,\n"," feature_user_gift_card_balance,\n"," feature_user_has_valid_credit_card\n","]\n","\n","user_feature_anchor = FeatureAnchor(name=\"anchored_features\",\n"," source=batch_source,\n"," features=features)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4da453e8-a8fd-40b8-a1e6-2a0e7cac3f6e","showTitle":false,"title":""}},"outputs":[],"source":["# Let's define some features for the products so our recommendation can be customized for proudcts.\n","product_batch_source = HdfsSource(name=\"productProfileData\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")\n","\n","product_id = TypedKey(key_column=\"product_id\",\n"," key_column_type=ValueType.INT32,\n"," description=\"product id\",\n"," full_name=\"product_recommendation.product_id\")\n","\n","feature_product_quantity = Feature(name=\"feature_product_quantity\",\n"," key=product_id,\n"," feature_type=FLOAT, \n"," transform=\"quantity\")\n","feature_product_price = Feature(name=\"feature_product_price\",\n"," key=product_id,\n"," feature_type=FLOAT,\n"," transform=\"price\")\n"," \n","product_features = [\n"," feature_product_quantity,\n"," feature_product_price\n","]\n","\n","product_anchor = FeatureAnchor(name=\"product_anchored_features\",\n"," source=product_batch_source,\n"," features=product_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"78e240b4-dcab-499f-b6ed-72a14bfab968","showTitle":false,"title":""}},"source":["### Window aggregation features\n","\n","For window aggregation features, see the supported fields below:\n","\n","Note that the `agg_func` should be any of these:\n","\n","| Aggregation Type | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |\n","\n","\n","After you have defined features and sources, bring them together to build an anchor:\n","\n","\n","Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b62a9041-73dc-45e1-add5-8fe01ebf355f","showTitle":false,"title":""}},"outputs":[],"source":["purchase_history_data = HdfsSource(name=\"purchase_history_data\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\",\n"," event_timestamp_column=\"purchase_date\",\n"," timestamp_format=\"yyyy-MM-dd\")\n"," \n","agg_features = [Feature(name=\"feature_user_totla_purchase_in_90days\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(purchase_amount)\",\n"," agg_func=\"AVG\",\n"," window=\"90d\"))\n"," ]\n","\n","user_agg_feature_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n"," source=purchase_history_data,\n"," features=agg_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a04373b5-8ab9-4c36-892f-6aa8129df999","showTitle":false,"title":""}},"source":["### Derived Features Section\n","Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"688a4562-d8e9-468a-a900-77e750a3c903","showTitle":false,"title":""}},"outputs":[],"source":["feature_user_purchasing_power = DerivedFeature(name=\"feature_user_purchasing_power\",\n"," key=user_id,\n"," feature_type=FLOAT,\n"," input_features=[\n"," feature_user_gift_card_balance, feature_user_has_valid_credit_card],\n"," transform=\"feature_user_gift_card_balance + if_else(toBoolean(feature_user_has_valid_credit_card), 100, 0)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f4d8f829-bfbc-4d6f-bc32-3a419a32e3d3","showTitle":false,"title":""}},"source":["And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c617bb8-2605-4d40-acc9-2156c86dfc56","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[user_agg_feature_anchor, user_feature_anchor, product_anchor], derived_feature_list=[\n"," feature_user_purchasing_power])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6b2877d0-2ab8-4c07-99d4-effc7336ee8a","showTitle":false,"title":""}},"source":["## Create training data using point-in-time correct feature join\n","\n","A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"30302a53-561f-4b85-ba25-8de9fc843c63","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrazure_test.avro'\n","else:\n","# output_path = 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/test123_temp/product_rec_new'\n"," output_path = feathr_output_path\n","\n","\n","user_feature_query = FeatureQuery(\n"," feature_list=[\"feature_user_age\", \n"," \"feature_user_tax_rate\", \n"," \"feature_user_gift_card_balance\", \n"," \"feature_user_has_valid_credit_card\", \n"," \"feature_user_totla_purchase_in_90days\",\n"," \"feature_user_purchasing_power\"\n"," ], \n"," key=user_id)\n","\n","product_feature_query = FeatureQuery(\n"," feature_list=[\n"," \"feature_product_quantity\",\n"," \"feature_product_price\"\n"," ], \n"," key=product_id)\n","\n","settings = ObservationSettings(\n"," observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_observation_mock_data.csv\",\n"," event_timestamp_column=\"event_timestamp\",\n"," timestamp_format=\"yyyy-MM-dd\")\n","client.get_offline_features(observation_settings=settings,\n"," feature_query=[user_feature_query, product_feature_query],\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cc7b6276-70c1-494f-83ca-53d442e3198a","showTitle":false,"title":""}},"source":["## Download the training dataset and show the result\n","\n","Let's use the helper function `get_result_df` to download the result and view it:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"120c9a21-1e1d-4ef5-8fe9-00d35a93cbf1","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)\n","\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"497d6a3b-94e2-4087-94b1-0a5d7baf3ab3","showTitle":false,"title":""}},"source":["## Train a machine learning model\n","After getting all the features, let's train a machine learning model with the converted feature by Feathr:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9bd661ae-430e-449b-9a62-9155828de099","showTitle":false,"title":""}},"outputs":[],"source":["from sklearn.ensemble import GradientBoostingRegressor\n","final_df = df_res\n","\n","final_df.drop([\"event_timestamp\"], axis=1, inplace=True, errors='ignore')\n","final_df.fillna(0, inplace=True)\n","final_df['product_rating'] = final_df['product_rating'].astype(\"float64\")\n","\n","train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"product_rating\"], axis=1),\n"," final_df[\"product_rating\"],\n"," test_size=0.2,\n"," random_state=42)\n","model = GradientBoostingRegressor()\n","model.fit(train_x, train_y)\n","\n","y_predict = model.predict(test_x)\n","\n","y_actual = test_y.values.flatten().tolist()\n","rmse = sqrt(mean_squared_error(y_actual, y_predict))\n","\n","sum_actuals = sum_errors = 0\n","\n","for actual_val, predict_val in zip(y_actual, y_predict):\n"," abs_error = actual_val - predict_val\n"," if abs_error < 0:\n"," abs_error = abs_error * -1\n","\n"," sum_errors = sum_errors + abs_error\n"," sum_actuals = sum_actuals + actual_val\n","\n","mean_abs_percent_error = sum_errors / sum_actuals\n","print(\"Model MAPE:\")\n","print(mean_abs_percent_error)\n","print()\n","print(\"Model Accuracy:\")\n","print(1 - mean_abs_percent_error)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fda62a21-e7d6-4044-879f-bc05f77d248e","showTitle":false,"title":""}},"source":["## Materialize feature value into offline/online storage\n","\n","While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n","and materialize the feature value to offline and/or online storage. \n","\n","We can push the generated features to the online store like below:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3375f18d-cb64-4f13-8789-07b9d9c5835e","showTitle":false,"title":""}},"outputs":[],"source":["# Materialize user features\n","# (You can only materialize features of same entity key into one table so we can only materialize user features first.)\n","backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"user_features\")\n","settings = MaterializationSettings(\"user_feature_setting\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"feature_user_age\", \"feature_user_gift_card_balance\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fb61ed8-6db4-461c-bd86-a5ff268a7c3d","showTitle":false,"title":""}},"source":["We can then get the features from the online store (Redis):"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ed5da7df-8095-403e-91a6-c5d2104eaf68","showTitle":false,"title":""}},"source":["## Fetching feature value for online inference\n","\n","For features that are already materialized by the previous step, their latest value can be queried via the client's\n","`get_online_features` or `multi_get_online_features` API."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9d8f3710-d2d4-463a-b452-99bd56bb3482","showTitle":false,"title":""}},"outputs":[],"source":["client.get_online_features('user_features', '2', [\n"," 'feature_user_age', 'feature_user_gift_card_balance'])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e8aa6e5f-5b2d-4778-bafa-5a3a45fdd3b5","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('user_features', ['1', '2'], [\n"," 'feature_user_age', 'feature_user_gift_card_balance'])\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b19b73c6-7b0e-4b22-8eb1-8afdc328df74","showTitle":false,"title":""}},"source":["## Materialize product features\n","\n","We can also materialize product features into a separate table."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7a28cc6f-06f7-4915-9f3e-0a057467b77b","showTitle":false,"title":""}},"outputs":[],"source":["# Materialize product features\n","backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"product_features\")\n","settings = MaterializationSettings(\"product_feature_setting\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"feature_product_price\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=1000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8732aad1-7b22-4efc-8e2c-722030ae8bfb","showTitle":false,"title":""}},"outputs":[],"source":["client.get_online_features('product_feature_setting', '2', [\n"," 'feature_product_price'])\n","\n","client.get_online_features('product_features', '2', [\n"," 'feature_product_price'])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"acd29f4d-715b-4889-954d-b648ea8e2a0f","showTitle":false,"title":""}},"source":["### Registering and Fetching features\n","\n","We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1255ed12-5030-43b6-b733-5a467874b708","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"feathr_getting_started\")"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"product_recommendation_demo_advanced","notebookOrigID":411375353096492,"widgets":{}},"interpreter":{"hash":"b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985"},"kernelspec":{"display_name":"Python 3.9.12 ('product_env': venv)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"}},"nbformat":4,"nbformat_minor":0} diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb index 988bd5c34..3de4b7d8c 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb +++ b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb @@ -162,12 +162,10 @@ "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "os.environ['online_store__redis__host'] = redis_host\n", "os.environ['online_store__redis__port'] = redis_port\n", "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", "os.environ['REDIS_PASSWORD']=redis_password\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" ] }, @@ -236,10 +234,7 @@ " port: 6380\n", " ssl_enabled: True\n", "feature_registry:\n", - " purview:\n", - " type_system_initialization: true\n", - " purview_name: 'feathrazuretest3-purview1'\n", - " delimiter: '__'\n", + " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", "\"\"\"\n", "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", "with open(tmp.name, \"w\") as text_file:\n",