diff --git a/.github/fork_workflows/fork_pr_integration_tests_aws.yml b/.github/fork_workflows/fork_pr_integration_tests_aws.yml new file mode 100644 index 0000000000..ef53fc1c7d --- /dev/null +++ b/.github/fork_workflows/fork_pr_integration_tests_aws.yml @@ -0,0 +1,159 @@ +name: fork-pr-integration-tests-aws + +on: [pull_request] + +jobs: + build-docker-image: + if: github.repository == 'your github repo' # swap here with your project id + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # pull_request_target runs the workflow in the context of the base repo + # as such actions/checkout needs to be explicit configured to retrieve + # code from the PR. + ref: refs/pull/${{ github.event.pull_request.number }}/merge + submodules: recursive + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + install: true + - name: Set up AWS SDK + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-west-2 + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v1 + - name: Set ECR image tag + id: image-tag + run: echo "::set-output name=DOCKER_IMAGE_TAG::`git rev-parse HEAD`" + - name: Cache Public ECR Image + id: lambda_python_3_9 + uses: actions/cache@v2 + with: + path: ~/cache + key: lambda_python_3_9 + - name: Handle Cache Miss (pull public ECR image & save it to tar file) + if: steps.cache-primes.outputs.cache-hit != 'true' + run: | + mkdir -p ~/cache + docker pull public.ecr.aws/lambda/python:3.9 + docker save public.ecr.aws/lambda/python:3.9 -o ~/cache/lambda_python_3_9.tar + - name: Handle Cache Hit (load docker image from tar file) + if: steps.cache-primes.outputs.cache-hit == 'true' + run: | + docker load -i ~/cache/lambda_python_3_9.tar + - name: Build and push + env: + ECR_REGISTRY: ${{ 
steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: feast-python-server + run: | + docker build \ + --file sdk/python/feast/infra/feature_servers/aws_lambda/Dockerfile \ + --tag $ECR_REGISTRY/$ECR_REPOSITORY:${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} \ + --load \ + . + docker push $ECR_REGISTRY/$ECR_REPOSITORY:${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} + outputs: + DOCKER_IMAGE_TAG: ${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} + integration-test-python: + if: github.repository == 'your github repo' # swap here with your project id + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8" ] + os: [ ubuntu-latest ] + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + services: + redis: + image: redis + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + with: + # pull_request_target runs the workflow in the context of the base repo + # as such actions/checkout needs to be explicit configured to retrieve + # code from the PR. 
+ ref: refs/pull/${{ github.event.pull_request.number }}/merge + submodules: recursive + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + - name: Set up AWS SDK + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-west-2 + - name: Use AWS CLI + run: aws sts get-caller-identity + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow + - name: Install dependencies + run: make install-python-ci-dependencies + - name: Setup Redis Cluster + run: | + docker pull 
vishnunair/docker-redis-cluster:latest + docker run -d -p 6001:6379 -p 6002:6380 -p 6003:6381 -p 6004:6382 -p 6005:6383 -p 6006:6384 --name redis-cluster vishnunair/docker-redis-cluster + - name: Test python + if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak + env: + FEAST_SERVER_DOCKER_IMAGE_TAG: ${{ needs.build-docker-image.outputs.DOCKER_IMAGE_TAG }} + run: | + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "aws and not Snowflake and not BigQuery" + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "File and not Snowflake and not BigQuery" + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "dynamo and not Snowflake and not BigQuery" + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "Redshift and not Snowflake and not BigQuery" + + diff --git a/.github/fork_workflows/fork_pr_integration_tests_gcp.yml b/.github/fork_workflows/fork_pr_integration_tests_gcp.yml new file mode 100644 index 0000000000..d53aef0155 --- /dev/null +++ b/.github/fork_workflows/fork_pr_integration_tests_gcp.yml @@ -0,0 +1,97 @@ +name: fork-pr-integration-tests-gcp + +on: [pull_request] + +jobs: + integration-test-python: + if: github.repository == 'your github repo' # swap here with your project id + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8" ] + os: [ ubuntu-latest ] + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + services: + redis: + image: redis + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + with: 
+ # pull_request_target runs the workflow in the context of the base repo + # as such actions/checkout needs to be explicit configured to retrieve + # code from the PR. + ref: refs/pull/${{ github.event.pull_request.number }}/merge + submodules: recursive + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + - name: Set up gcloud SDK + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + export_default_credentials: true + - name: Use gcloud CLI + run: gcloud info + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow + - name: Install 
dependencies + run: make install-python-ci-dependencies + - name: Setup Redis Cluster + run: | + docker pull vishnunair/docker-redis-cluster:latest + docker run -d -p 6001:6379 -p 6002:6380 -p 6003:6381 -p 6004:6382 -p 6005:6383 -p 6006:6384 --name redis-cluster vishnunair/docker-redis-cluster + - name: Test python + if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak + # Run only BigQuery and File tests without dynamo and redshift tests. + run: | + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "BigQuery and not dynamo and not Redshift and not Snowflake" + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "File and not dynamo and not Redshift and not Snowflake" + diff --git a/.github/fork_workflows/fork_pr_integration_tests_snowflake.yml b/.github/fork_workflows/fork_pr_integration_tests_snowflake.yml new file mode 100644 index 0000000000..8832c75fca --- /dev/null +++ b/.github/fork_workflows/fork_pr_integration_tests_snowflake.yml @@ -0,0 +1,96 @@ +name: fork-pr-integration-tests-snowflake + +on: [pull_request] + +jobs: + integration-test-python: + if: github.repository == 'your github repo' # swap here with your project id + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8" ] + os: [ ubuntu-latest ] + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + services: + redis: + image: redis + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + with: + # pull_request_target runs the workflow in the context of the base repo + # as such actions/checkout needs to be explicit configured to retrieve + # code from the PR. 
+ ref: refs/pull/${{ github.event.pull_request.number }}/merge + submodules: recursive + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow + - name: Install dependencies + run: make install-python-ci-dependencies + - name: Setup Redis Cluster + run: | + docker pull vishnunair/docker-redis-cluster:latest + docker run -d -p 6001:6379 -p 6002:6380 -p 6003:6381 -p 6004:6382 -p 6005:6383 -p 6006:6384 --name redis-cluster vishnunair/docker-redis-cluster + - name: Test python + if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak + env: 
+ SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} + SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} + SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} + SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} + # Run only Snowflake BigQuery and File tests without dynamo and redshift tests. + run: | + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "Snowflake and not dynamo and not Redshift and not Bigquery and not gcp" + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "File and not dynamo and not Redshift and not Bigquery and not gcp" + diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index c47a8ec5c3..278be10b89 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -89,6 +89,7 @@ jobs: make install-go-ci-dependencies git status git restore go.mod go.sum + git restore sdk/python/feast/ui/yarn.lock CIBW_BEFORE_TEST: "cd {project} && git status" # py3.10 on MacOS does not work with Go so we have to install separately. Issue is tracked here: https://github.com/feast-dev/feast/issues/2881. 
- name: Build py310 specific wheels for macos @@ -104,6 +105,7 @@ jobs: CIBW_BEFORE_BUILD: | git status git restore go.mod go.sum + git restore sdk/python/feast/ui/yarn.lock brew install apache-arrow - uses: actions/upload-artifact@v2 with: @@ -136,6 +138,7 @@ jobs: make build-ui git status git restore go.mod go.sum + git restore sdk/python/feast/ui/yarn.lock - name: Build run: | python3 setup.py sdist @@ -232,7 +235,7 @@ jobs: - name: Smoke test run: | feast init test_repo - cd test_repo/ + cd test_repo/feature_repo feast apply echo "$TEST_SCRIPT" > run-and-wait.sh bash run-and-wait.sh feast serve @@ -241,7 +244,7 @@ jobs: - name: Smoke test with go if: matrix.python-version != '3.10' || matrix.os == 'ubuntu-latest' run: | - cd test_repo/ + cd test_repo/feature_repo feast apply echo "$TEST_SCRIPT" > run-and-wait.sh pip install cffi diff --git a/.github/workflows/java_master_only.yml b/.github/workflows/java_master_only.yml index fc2bb52387..c3548991bb 100644 --- a/.github/workflows/java_master_only.yml +++ b/.github/workflows/java_master_only.yml @@ -9,6 +9,7 @@ on: jobs: build-docker-images: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest strategy: matrix: @@ -46,6 +47,7 @@ jobs: fi lint-java: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -55,6 +57,7 @@ jobs: run: make lint-java unit-test-java: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -66,6 +69,12 @@ jobs: java-version: '11' java-package: jdk architecture: x64 + - uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-it-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-it-maven- - uses: actions/cache@v2 with: path: ~/.m2/repository @@ -80,7 +89,10 @@ jobs: path: ${{ github.workspace }}/docs/coverage/java/target/site/jacoco-aggregate/ integration-test: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest + 
env: + PYTHON: 3.8 steps: - uses: actions/checkout@v2 with: @@ -91,10 +103,46 @@ jobs: java-version: '11' java-package: jdk architecture: x64 - - uses: actions/setup-python@v2 + - name: Setup Python (to call feast apply) + uses: actions/setup-python@v2 + id: setup-python with: - python-version: '3.8' - architecture: 'x64' + python-version: 3.8 + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install Python dependencies + run: make install-python-ci-dependencies - uses: actions/cache@v2 with: path: ~/.m2/repository diff --git a/.github/workflows/java_pr.yml b/.github/workflows/java_pr.yml index 39593f02ce..72f419e409 100644 --- a/.github/workflows/java_pr.yml +++ b/.github/workflows/java_pr.yml @@ -9,6 +9,7 @@ on: jobs: lint-java: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest steps: - uses: 
actions/checkout@v2 @@ -22,6 +23,7 @@ jobs: run: make lint-java unit-test-java: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest needs: lint-java steps: @@ -38,6 +40,12 @@ jobs: java-version: '11' java-package: jdk architecture: x64 + - uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-it-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-it-maven- - uses: actions/cache@v2 with: path: ~/.m2/repository @@ -51,13 +59,44 @@ jobs: name: java-coverage-report path: ${{ github.workspace }}/docs/coverage/java/target/site/jacoco-aggregate/ - integration-test: + build-docker-image-java: + if: github.repository == 'feast-dev/feast' + runs-on: ubuntu-latest + strategy: + matrix: + component: [ feature-server-java ] + env: + MAVEN_CACHE: gs://feast-templocation-kf-feast/.m2.2020-08-19.tar + REGISTRY: gcr.io/kf-feast + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - uses: google-github-actions/setup-gcloud@v0 + with: + version: '290.0.1' + export_default_credentials: true + project_id: ${{ secrets.GCP_PROJECT_ID }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + - run: gcloud auth configure-docker --quiet + - name: Get m2 cache + run: | + infra/scripts/download-maven-cache.sh \ + --archive-uri ${MAVEN_CACHE} \ + --output-dir . + - name: Build image + run: make build-${{ matrix.component }}-docker REGISTRY=${REGISTRY} VERSION=${GITHUB_SHA} + + integration-test-java-pr: # all jobs MUST have this if check for 'ok-to-test' or 'approved' for security purposes. 
if: - (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'ok-to-test')) || - (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved'))) + ((github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'ok-to-test')) || + (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved')))) && + github.repository == 'feast-dev/feast' runs-on: ubuntu-latest needs: unit-test-java + env: + PYTHON: 3.8 steps: - uses: actions/checkout@v2 with: @@ -98,6 +137,46 @@ jobs: aws-region: us-west-2 - name: Use AWS CLI run: aws sts get-caller-identity + - name: Setup Python (to call feast apply) + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: 3.8 + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 
'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install Python dependencies + run: make install-python-ci-dependencies - name: Run integration tests run: make test-java-integration - name: Save report diff --git a/.github/workflows/lint_pr.yml b/.github/workflows/lint_pr.yml index 40c3dead00..f9af8b27c7 100644 --- a/.github/workflows/lint_pr.yml +++ b/.github/workflows/lint_pr.yml @@ -9,6 +9,7 @@ on: jobs: validate-title: + if: github.repository == 'feast-dev/feast' name: Validate PR title runs-on: ubuntu-latest steps: diff --git a/.github/workflows/master_only.yml b/.github/workflows/master_only.yml index c9ebcdaf04..51e3830fe6 100644 --- a/.github/workflows/master_only.yml +++ b/.github/workflows/master_only.yml @@ -7,6 +7,7 @@ on: jobs: build-lambda-docker-image: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -58,6 +59,7 @@ jobs: outputs: DOCKER_IMAGE_TAG: ${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} integration-test-python-and-go: + if: github.repository == 'feast-dev/feast' needs: build-lambda-docker-image runs-on: ${{ matrix.os }} strategy: @@ -180,6 +182,7 @@ jobs: run: aws s3 cp --recursive .benchmarks s3://feast-ci-pytest-benchmarks build-all-docker-images: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest strategy: matrix: diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index fead512408..e1370b10b1 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -11,6 +11,7 @@ on: jobs: check_date: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest name: Check latest commit outputs: @@ -23,7 +24,34 @@ jobs: name: Check if there were commits in the last day if: ${{ github.event_name == 'schedule' }} run: echo 
'::set-output name=WAS_EDITED::'$(test -n "$(git log --format=%H --since='24 hours ago')" && echo 'true' || echo 'false') + cleanup_dynamo_tables: + if: github.repository == 'feast-dev/feast' + runs-on: ubuntu-latest + name: Cleanup dynamo tables which can fail to cleanup + steps: + - uses: actions/checkout@v2 + with: + ref: master + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: "3.8" + architecture: x64 + - name: Set up AWS SDK + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-west-2 + - name: Install Python dependencies + run: | + pip install boto3 + pip install tqdm + - name: Run DynamoDB cleanup script + run: python infra/scripts/cleanup_dynamo_ci.py build-docker-image: + if: github.repository == 'feast-dev/feast' needs: [check_date] runs-on: ubuntu-latest steps: @@ -79,7 +107,8 @@ jobs: outputs: DOCKER_IMAGE_TAG: ${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} integration-test-python: - needs: [check_date, build-docker-image] + if: github.repository == 'feast-dev/feast' + needs: [check_date, build-docker-image, cleanup_dynamo_tables] runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/pr_integration_tests.yml b/.github/workflows/pr_integration_tests.yml index 58bf45c687..ab8a79760f 100644 --- a/.github/workflows/pr_integration_tests.yml +++ b/.github/workflows/pr_integration_tests.yml @@ -16,8 +16,9 @@ jobs: build-docker-image: # all jobs MUST have this if check for 'ok-to-test' or 'approved' for security purposes. 
if: - (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || - (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm'))) + ((github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || + (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm')))) && + github.repository == 'feast-dev/feast' runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -77,8 +78,9 @@ jobs: integration-test-python: # all jobs MUST have this if check for 'ok-to-test' or 'approved' for security purposes. 
if: - (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || - (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm'))) + ((github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || + (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm')))) && + github.repository == 'feast-dev/feast' needs: build-docker-image runs-on: ${{ matrix.os }} strategy: diff --git a/.github/workflows/pr_local_integration_tests.yml b/.github/workflows/pr_local_integration_tests.yml index d4db8a3a7c..4705771911 100644 --- a/.github/workflows/pr_local_integration_tests.yml +++ b/.github/workflows/pr_local_integration_tests.yml @@ -12,8 +12,9 @@ jobs: integration-test-python-local: # all jobs MUST have this if check for 'ok-to-test' or 'approved' for security purposes. 
if: - (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || - (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm'))) + ((github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || + (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm')))) || + github.repository != 'feast-dev/feast' runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -67,14 +68,11 @@ jobs: sudo apt install -y -V libarrow-dev - name: Install dependencies run: make install-python-ci-dependencies - - name: Set up gcloud SDK # TODO(adchia): remove this dependency - uses: google-github-actions/setup-gcloud@v0 - with: - project_id: ${{ secrets.GCP_PROJECT_ID }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - export_default_credentials: true - - name: Use gcloud CLI - run: gcloud info - name: Test local integration tests if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak - run: make test-python-integration-local + env: + FEAST_USAGE: "False" + IS_TEST: "True" + FEAST_LOCAL_ONLINE_CONTAINER: "True" + FEAST_IS_LOCAL_TEST: "True" + run: pytest -n 8 --cov=./ --cov-report=xml --color=yes --integration -k "not gcs_registry and not s3_registry and not test_lambda_materialization and not test_snowflake_materialization" sdk/python/tests diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 184fdb3cb6..46e1665754 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ 
-7,6 +7,7 @@ on: jobs: get-version: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest outputs: release_version: ${{ steps.get_release_version.outputs.release_version }} @@ -48,7 +49,7 @@ jobs: needs: get-version strategy: matrix: - component: [feature-server-python-aws, feature-server-java, feature-transformation-server] + component: [feature-server-python, feature-server-python-aws, feature-server-java, feature-transformation-server] env: MAVEN_CACHE: gs://feast-templocation-kf-feast/.m2.2020-08-19.tar REGISTRY: feastdev @@ -100,6 +101,7 @@ jobs: fi publish-helm-charts: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest needs: get-version env: @@ -129,6 +131,7 @@ jobs: uses: ./.github/workflows/build_wheels.yml publish-python-sdk: + if: github.repository == 'feast-dev/feast' runs-on: ubuntu-latest needs: [build_wheels] steps: @@ -142,6 +145,7 @@ jobs: password: ${{ secrets.PYPI_PASSWORD }} publish-java-sdk: + if: github.repository == 'feast-dev/feast' container: maven:3.6-jdk-11 runs-on: ubuntu-latest needs: get-version @@ -177,23 +181,3 @@ jobs: mkdir -p /root/.m2/ echo -n "$MAVEN_SETTINGS" > /root/.m2/settings.xml infra/scripts/publish-java-sdk.sh --revision ${VERSION_WITHOUT_PREFIX} --gpg-key-import-dir /root - - publish-web-ui-npm: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-node@v2 - with: - node-version: '17.x' - registry-url: 'https://registry.npmjs.org' - - name: Install yarn dependencies - working-directory: ./ui - run: yarn install - - name: Build yarn rollup - working-directory: ./ui - run: yarn build:lib - - name: Publish UI package - working-directory: ./ui - run: npm publish - env: - NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2f4d15590a..feab7b0eef 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,9 +15,67 @@ on: type: string jobs: + + get_dry_release_versions: 
+ runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ github.event.inputs.token }} + outputs: + current_version: ${{ steps.get_versions.outputs.current_version }} + next_version: ${{ steps.get_versions.outputs.next_version }} + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + persist-credentials: false + - name: Setup Node.js + uses: actions/setup-node@v2 + with: + node-version: '16' + - name: Release (Dry Run) + id: get_versions + run: | + CURRENT_VERSION=$(npx -p @semantic-release/changelog -p @semantic-release/git -p @semantic-release/exec -p semantic-release semantic-release --dry-run | grep "associated with version " | sed -E 's/.* version//' | sed -E 's/ on.*//') + NEXT_VERSION=$(npx -p @semantic-release/changelog -p @semantic-release/git -p @semantic-release/exec -p semantic-release semantic-release --dry-run | grep 'The next release version is' | sed -E 's/.* ([[:digit:].]+)$/\1/') + echo ::set-output name=current_version::$CURRENT_VERSION + echo ::set-output name=next_version::$NEXT_VERSION + echo "Current version is ${CURRENT_VERSION}" + echo "Next version is ${NEXT_VERSION}" + + # publish-web-ui-npm: + # if: github.repository == 'feast-dev/feast' + # needs: get_dry_release_versions + # runs-on: ubuntu-latest + # env: + # # This publish is working using an NPM automation token to bypass 2FA + # NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + # CURRENT_VERSION: ${{ needs.get_dry_release_versions.outputs.current_version }} + # NEXT_VERSION: ${{ needs.get_dry_release_versions.outputs.next_version }} + # steps: + # - uses: actions/checkout@v2 + # - uses: actions/setup-node@v2 + # with: + # node-version: '17.x' + # registry-url: 'https://registry.npmjs.org' + # - name: Bump file versions (temporarily for Web UI publish) + # run: python ./infra/scripts/release/bump_file_versions.py ${CURRENT_VERSION} ${NEXT_VERSION} + # - name: Install yarn dependencies + # working-directory: ./ui + # run: yarn install + # - name: Build yarn rollup + # working-directory: 
./ui + # run: yarn build:lib + # - name: Publish UI package + # working-directory: ./ui + # run: npm publish + # env: + # # This publish is working using an NPM automation token to bypass 2FA + # NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + release: name: release runs-on: ubuntu-latest + #needs: publish-web-ui-npm env: GITHUB_TOKEN: ${{ github.event.inputs.token }} GIT_AUTHOR_NAME: feast-ci-bot diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 7bbe9ad6ac..de6d98d140 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -1,6 +1,6 @@ name: unit-tests -on: [push, pull_request] +on: [pull_request] jobs: unit-test-python: runs-on: ${{ matrix.os }} @@ -69,15 +69,10 @@ jobs: - name: Install dependencies run: make install-python-ci-dependencies - name: Test Python - env: - SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} - SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} - SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} - SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} - SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} run: pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 + if: github.repository == 'feast-dev/feast' with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml diff --git a/.gitignore b/.gitignore index 6a86eb2682..1edde846ff 100644 --- a/.gitignore +++ b/.gitignore @@ -125,6 +125,8 @@ instance/ # Sphinx documentation docs/_build/ +sdk/python/docs/source +sdk/python/docs/html # PyBuilder target/ @@ -184,7 +186,6 @@ dmypy.json *.code-workspace # Protos -sdk/python/docs/html sdk/python/feast/protos/ sdk/go/protos/ go/protos/ diff --git a/.releaserc.js b/.releaserc.js index 2acf9b7350..aadc4373e9 100644 --- a/.releaserc.js +++ b/.releaserc.js @@ -28,18 +28,26 @@ module.exports = { "releaseRules": [ {breaking: true, release: 'minor'}, {tag: 'Breaking', release: 
'minor'}, - ] + {type: '*!', release: 'minor'}, + ], + // Ensure that the "BREAKING CHANGE" notes in commit footers are parsed + "parserOpts": { + "noteKeywords": ["BREAKING CHANGE", "BREAKING CHANGES"] + } }], ["@semantic-release/exec", { // Validate the type of release we are doing "verifyReleaseCmd": "./infra/scripts/validate-release.sh ${nextRelease.type} " + current_branch, - // Bump all version files - "prepareCmd": "python ./infra/scripts/release/bump_file_versions.py ${lastRelease.version} ${nextRelease.version}" + // Bump all version files and build UI / update yarn.lock + "prepareCmd": "python ./infra/scripts/release/bump_file_versions.py ${lastRelease.version} ${nextRelease.version}; make build-ui" }], - "@semantic-release/release-notes-generator", + ["@semantic-release/release-notes-generator", { + // Ensure that a "Breaking Changes" section is added to the release notes + "preset": "angular" + }], // Update the changelog [ @@ -58,7 +66,8 @@ module.exports = { "CHANGELOG.md", "java/pom.xml", "infra/charts/**/*.*", - "ui/package.json" + "ui/package.json", + "sdk/python/feast/ui/yarn.lock" ], message: "chore(release): release ${nextRelease.version}\n\n${nextRelease.notes}" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 80852af83d..1cb5456577 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,48 @@ # Changelog +# [0.24.0](https://github.com/feast-dev/feast/compare/v0.23.0...v0.24.0) (2022-08-25) + + +### Bug Fixes + +* Check if on_demand_feature_views is an empty list rather than None for snowflake provider ([#3046](https://github.com/feast-dev/feast/issues/3046)) ([9b05e65](https://github.com/feast-dev/feast/commit/9b05e651546d1526aa166854e425285c8ec3a6d5)) +* FeatureStore.apply applies BatchFeatureView correctly ([#3098](https://github.com/feast-dev/feast/issues/3098)) ([41be511](https://github.com/feast-dev/feast/commit/41be51170db1f9d9439ac801625458d471827cee)) +* Fix Feast Java inconsistency with int64 serialization vs python 
([#3031](https://github.com/feast-dev/feast/issues/3031)) ([4bba787](https://github.com/feast-dev/feast/commit/4bba78709e4e5884ce9aad75e84f9b4449665b81)) +* Fix feature service inference logic ([#3089](https://github.com/feast-dev/feast/issues/3089)) ([4310ed7](https://github.com/feast-dev/feast/commit/4310ed7e687da0e80a18c6d8cb95cfb15bbd1eae)) +* Fix field mapping logic during feature inference ([#3067](https://github.com/feast-dev/feast/issues/3067)) ([cdfa761](https://github.com/feast-dev/feast/commit/cdfa761a16206afcdb64959c25ee3b5f2b312566)) +* Fix incorrect on demand feature view diffing and improve Java tests ([#3074](https://github.com/feast-dev/feast/issues/3074)) ([0702310](https://github.com/feast-dev/feast/commit/0702310366882a388af4f299a69467841c132259)) +* Fix Java helm charts to work with refactored logic. Fix FTS image ([#3105](https://github.com/feast-dev/feast/issues/3105)) ([2b493e0](https://github.com/feast-dev/feast/commit/2b493e0457cea19a9b3faa163f099d6b32fde30d)) +* Fix on demand feature view output in feast plan + Web UI crash ([#3057](https://github.com/feast-dev/feast/issues/3057)) ([bfae6ac](https://github.com/feast-dev/feast/commit/bfae6ac5a42fcdeebfaed2d1473c546da23c3bdc)) +* Fix release workflow to release 0.24.0 ([#3138](https://github.com/feast-dev/feast/issues/3138)) ([a69aaae](https://github.com/feast-dev/feast/commit/a69aaae4c5595e87501e0b5d58533360306fb831)) +* Fix Spark offline store type conversion to arrow ([#3071](https://github.com/feast-dev/feast/issues/3071)) ([b26566d](https://github.com/feast-dev/feast/commit/b26566d92573164d9968fb356fd68446725f70f5)) +* Fixing Web UI, which fails for the SQL registry ([#3028](https://github.com/feast-dev/feast/issues/3028)) ([64603b6](https://github.com/feast-dev/feast/commit/64603b677421b21f04bd72238e358dac43122b29)) +* Force Snowflake Session to Timezone UTC ([#3083](https://github.com/feast-dev/feast/issues/3083)) 
([9f221e6](https://github.com/feast-dev/feast/commit/9f221e66eb2dd83b0e6beb528a694f4933953571)) +* Make infer dummy entity join key idempotent ([#3115](https://github.com/feast-dev/feast/issues/3115)) ([1f5b1e0](https://github.com/feast-dev/feast/commit/1f5b1e078b41729938ab6b4d9a35c2fcb2be39cd)) +* More explicit error messages ([#2708](https://github.com/feast-dev/feast/issues/2708)) ([e4d7afd](https://github.com/feast-dev/feast/commit/e4d7afdce8fc2596d1a27f2d85f259f2fa35bafa)) +* Parse inline data sources ([#3036](https://github.com/feast-dev/feast/issues/3036)) ([c7ba370](https://github.com/feast-dev/feast/commit/c7ba370aa14ef3216c84aaa8852fd519931840bf)) +* Prevent overwriting existing file during `persist` ([#3088](https://github.com/feast-dev/feast/issues/3088)) ([69af21f](https://github.com/feast-dev/feast/commit/69af21f4c487506417d7bbb3ad32f1be2246a654)) +* Register BatchFeatureView in feature repos correctly ([#3092](https://github.com/feast-dev/feast/issues/3092)) ([b8e39ea](https://github.com/feast-dev/feast/commit/b8e39ea4cd2d990f2422c60bf39d8d940ecc9522)) +* Return an empty infra object from sql registry when it doesn't exist ([#3022](https://github.com/feast-dev/feast/issues/3022)) ([8ba87d1](https://github.com/feast-dev/feast/commit/8ba87d1b550526c24bb5f6b3ce63c6435676a5cb)) +* Teardown tables for Snowflake Materialization testing ([#3106](https://github.com/feast-dev/feast/issues/3106)) ([0a0c974](https://github.com/feast-dev/feast/commit/0a0c974f38e7ef41ecf0af5c6a3eacac7369aa38)) +* UI error when saved dataset is present in registry. 
([#3124](https://github.com/feast-dev/feast/issues/3124)) ([83cf753](https://github.com/feast-dev/feast/commit/83cf7533fa757af7eb3ab1c4f540aca8edd134af)) +* Update sql.py ([#3096](https://github.com/feast-dev/feast/issues/3096)) ([2646a86](https://github.com/feast-dev/feast/commit/2646a864b0031617b26577926ade2341f998557b)) +* Updated snowflake template ([#3130](https://github.com/feast-dev/feast/issues/3130)) ([f0594e1](https://github.com/feast-dev/feast/commit/f0594e160a1f11d896e884b40f7e1110d2df6aa9)) + + +### Features + +* Add authentication option for snowflake connector ([#3039](https://github.com/feast-dev/feast/issues/3039)) ([74c75f1](https://github.com/feast-dev/feast/commit/74c75f1f4c91f0097f9a1085a4e68a07c524037d)) +* Add Cassandra/AstraDB online store contribution ([#2873](https://github.com/feast-dev/feast/issues/2873)) ([feb6cb8](https://github.com/feast-dev/feast/commit/feb6cb8518889288d6ddd97e4482db2f6b86eabd)) +* Add Snowflake materialization engine ([#2948](https://github.com/feast-dev/feast/issues/2948)) ([f3b522b](https://github.com/feast-dev/feast/commit/f3b522b007cc5e5ccd32dbe04e47d30136810f6c)) +* Adding saved dataset capabilities for Postgres ([#3070](https://github.com/feast-dev/feast/issues/3070)) ([d3253c3](https://github.com/feast-dev/feast/commit/d3253c362deb775a8f1f5cd325e44d3e598d0bdf)) +* Allow passing repo config path via flag ([#3077](https://github.com/feast-dev/feast/issues/3077)) ([0d2d951](https://github.com/feast-dev/feast/commit/0d2d951d565daac1a4f01fab988d44010b6856bb)) +* Contrib azure provider with synapse/mssql offline store and Azure registry store ([#3072](https://github.com/feast-dev/feast/issues/3072)) ([9f7e557](https://github.com/feast-dev/feast/commit/9f7e5573e764466590badab4250b69aef6f256b0)) +* Custom Docker image for Bytewax batch materialization ([#3099](https://github.com/feast-dev/feast/issues/3099)) ([cdd1b07](https://github.com/feast-dev/feast/commit/cdd1b0734fcb20c258cf6ee4c067f23d2fff81e0)) +* Feast AWS 
Athena offline store (again) ([#3044](https://github.com/feast-dev/feast/issues/3044)) ([989ce08](https://github.com/feast-dev/feast/commit/989ce085c0949564af61afb73401c27669cfdaba)) +* Implement spark offline store `offline_write_batch` method ([#3076](https://github.com/feast-dev/feast/issues/3076)) ([5b0cc87](https://github.com/feast-dev/feast/commit/5b0cc8798616455d955b543cb8012ad88927aea2)) +* Initial Bytewax materialization engine ([#2974](https://github.com/feast-dev/feast/issues/2974)) ([55c61f9](https://github.com/feast-dev/feast/commit/55c61f9c4584bf040a7dc1719200704402811d6d)) +* Refactor feature server helm charts to allow passing feature_store.yaml in environment variables ([#3113](https://github.com/feast-dev/feast/issues/3113)) ([85ee789](https://github.com/feast-dev/feast/commit/85ee78947bc9793b17348c08325844a2ee4ad0ff)) + # [0.23.0](https://github.com/feast-dev/feast/compare/v0.22.0...v0.23.0) (2022-08-02) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8671d9986..ae259a72fa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,49 @@ -# Development Guide: Main Feast Repository +

Development Guide: Main Feast Repository

+ > Please see [Development Guide](https://docs.feast.dev/project/development-guide) for project-level development instructions. +

Maintainer's Guide

+ +> Please see [Maintainer's Guide](https://docs.feast.dev/project/maintainers) for instructions for maintainers. Normal developers can also use this guide to set up their forks for localized integration tests. + +

Table of Contents

+ +- [Overview](#overview) +- [Community](#community) +- [Making a pull request](#making-a-pull-request) + - [Pull request checklist](#pull-request-checklist) + - [Forking the repo](#forking-the-repo) + - [Pre-commit Hooks](#pre-commit-hooks) + - [Signing off commits](#signing-off-commits) + - [Incorporating upstream changes from master](#incorporating-upstream-changes-from-master) +- [Feast Python SDK / CLI](#feast-python-sdk--cli) + - [Environment Setup](#environment-setup) + - [Code Style & Linting](#code-style--linting) + - [Unit Tests](#unit-tests) + - [Integration Tests](#integration-tests) + - [Local integration tests](#local-integration-tests) + - [(Advanced) Full integration tests](#advanced-full-integration-tests) + - [(Advanced) Running specific provider tests or running your test against specific online or offline stores](#advanced-running-specific-provider-tests-or-running-your-test-against-specific-online-or-offline-stores) + - [(Experimental) Run full integration tests against containerized services](#experimental-run-full-integration-tests-against-containerized-services) + - [Contrib integration tests](#contrib-integration-tests) + - [(Contrib) Running tests for Spark offline store](#contrib-running-tests-for-spark-offline-store) + - [(Contrib) Running tests for Trino offline store](#contrib-running-tests-for-trino-offline-store) + - [(Contrib) Running tests for Postgres offline store](#contrib-running-tests-for-postgres-offline-store) + - [(Contrib) Running tests for Postgres online store](#contrib-running-tests-for-postgres-online-store) + - [(Contrib) Running tests for HBase online store](#contrib-running-tests-for-hbase-online-store) +- [(Experimental) Feast UI](#experimental-feast-ui) +- [Feast Java Serving](#feast-java-serving) +- [Developing the Feast Helm charts](#developing-the-feast-helm-charts) + - [Feast Java Feature Server Helm Chart](#feast-java-feature-server-helm-chart) + - [Feast Python / Go Feature Server Helm 
Chart](#feast-python--go-feature-server-helm-chart) +- [Feast Go Client](#feast-go-client) + - [Environment Setup](#environment-setup-1) + - [Building](#building) + - [Code Style & Linting](#code-style--linting-1) + - [Unit Tests](#unit-tests-1) + - [Testing with Github Actions workflows](#testing-with-github-actions-workflows) +- [Issues](#issues) + ## Overview This guide is targeted at developers looking to contribute to Feast components in the main Feast repository: @@ -8,6 +51,8 @@ the main Feast repository: - [Feast Java Serving](#feast-java-serving) - [Feast Go Client](#feast-go-client) +Please see [this page](https://docs.feast.dev/reference/codebase-structure) for more details on the structure of the entire codebase. + ## Community See [Contribution process](https://docs.feast.dev/project/contributing) and [Community](https://docs.feast.dev/community) for details on how to get more involved in the community. @@ -77,7 +122,7 @@ Note that this means if you are midway through working through a PR and rebase, Setting up your development environment for Feast Python SDK / CLI: 1. Ensure that you have Docker installed in your environment. Docker is used to provision service dependencies during testing, and build images for feature servers and other components. 1. Please note that we use [Docker with BuiltKit](https://docs.docker.com/develop/develop-images/build_enhancements/). -2. Ensure that you have `make`, Python (3.7 and above) with `pip`, installed. +2. Ensure that you have `make`, Python (3.8 and above) with `pip`, installed. 3. _Recommended:_ Create a virtual environment to isolate development dependencies to be installed ```sh # create & activate a virtual environment @@ -100,6 +145,8 @@ make build-ui pip install -e ".[dev]" ``` +This will allow the installed feast version to automatically reflect changes to your local development version of Feast without needing to reinstall everytime you make code changes. 
+ ### Code Style & Linting Feast Python SDK / CLI codebase: - Conforms to [Black code style](https://black.readthedocs.io/en/stable/the_black_code_style.html) @@ -147,7 +194,7 @@ These tests create new temporary tables / datasets locally only, and they are cl make test-python-integration-local ``` -#### Full integration tests +#### (Advanced) Full integration tests To test across clouds, on top of setting up Redis, you also need GCP / AWS / Snowflake setup. > Note: you can manually control what tests are run today by inspecting @@ -155,36 +202,46 @@ To test across clouds, on top of setting up Redis, you also need GCP / AWS / Sno > and commenting out tests that are added to `DEFAULT_FULL_REPO_CONFIGS` **GCP** -1. Install the [Cloud SDK](https://cloud.google.com/sdk/docs/install). -2. Then run login to gcloud: +1. You can get free credits [here](https://cloud.google.com/free/docs/free-cloud-features#free-trial). +2. You will need to setup a service account, enable the BigQuery API, and create a staging location for a bucket. + * Setup your service account and project using steps 1-5 [here](https://codelabs.developers.google.com/codelabs/cloud-bigquery-python#0). + * Remember to save your `PROJECT_ID` and your `key.json`. These will be your secrets that you will need to configure in Github actions. Namely, `secrets.GCP_PROJECT_ID` and `secrets.GCP_SA_KEY`. The `GCP_SA_KEY` value is the contents of your `key.json` file. + * Follow these [instructions](https://cloud.google.com/storage/docs/creating-buckets) in your project to create a bucket for running GCP tests and remember to save the bucket name. + * Make sure to add the service account email that you created in the previous step to the users that can access your bucket. Then, make sure to give the account the correct access roles, namely `objectCreator`, `objectViewer`, `objectAdmin`, and `admin`, so that your tests can use the bucket. +3. Install the [Cloud SDK](https://cloud.google.com/sdk/docs/install). +4. 
Login to gcloud if you haven't already: ``` gcloud auth login gcloud auth application-default login ``` -- When you run `gcloud auth application-default login`, you should see some output of the form: - ``` - Credentials saved to file: [$HOME/.config/gcloud/application_default_credentials.json] - ``` -- You should run `export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json”` to add the application credentials to your .zshrc or .bashrc. -3. Run `export GCLOUD_PROJECT=[your project]` to your .zshrc or .bashrc. -4. Running `gcloud config list` should give you something like this: -```sh -$ gcloud config list -[core] -account = [your email] -disable_usage_reporting = True -project = [your project] + - When you run `gcloud auth application-default login`, you should see some output of the form: + ``` + Credentials saved to file: [$HOME/.config/gcloud/application_default_credentials.json] + ``` + - You should run `export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json”` to add the application credentials to your .zshrc or .bashrc. +5. Run `export GCLOUD_PROJECT=[your project id from step 2]` to your .zshrc or .bashrc. +6. Running `gcloud config list` should give you something like this: + ```sh + $ gcloud config list + [core] + account = [your email] + disable_usage_reporting = True + project = [your project id] + + Your active configuration is: [default] + ``` +7. Export GCP specific environment variables in your workflow. Namely, + ```sh + export GCS_REGION='[your gcs region e.g US]' + export GCS_STAGING_LOCATION='[your gcs staging location]' + ``` + **NOTE**: Your `GCS_STAGING_LOCATION` should be in the form `gs://` where the bucket name is from step 2. -Your active configuration is: [default] -``` -5. Export gcp specific environment variables. Namely, -```sh -export GCS_REGION='[your gcs region e.g US]' -export GCS_STAGING_LOCATION='[your gcs staging location]' -``` +8. 
Once authenticated, you should be able to run the integration tests for BigQuery without any failures. **AWS** -1. TODO(adchia): flesh out setting up AWS login (or create helper script) +1. Setup AWS by creating an account, database, and cluster. You will need to enable Redshift and Dynamo. + * You can get free credits [here](https://aws.amazon.com/free/?all-free-tier.sort-by=item.additionalFields.SortRank&al[…]f.Free%20Tier%20Types=*all&awsf.Free%20Tier%20Categories=*all). 2. To run the AWS Redshift and Dynamo integration tests you will have to export your own AWS credentials. Namely, ```sh @@ -200,18 +257,42 @@ export AWS_REGISTRY_PATH='[your aws registry path]' **Snowflake** 1. See https://signup.snowflake.com/ to setup a trial. -2. Then to run successfully, you'll need some environment variables setup: -```sh -export SNOWFLAKE_CI_DEPLOYMENT='[snowflake_deployment]' -export SNOWFLAKE_CI_USER='[your user]' -export SNOWFLAKE_CI_PASSWORD='[your pw]' -export SNOWFLAKE_CI_ROLE='[your CI role e.g. SYSADMIN]' -export SNOWFLAKE_CI_WAREHOUSE='[your warehouse]' -``` +2. Setup your account and if you are not an `ACCOUNTADMIN` (if you created your own account, you should be), give yourself the `SYSADMIN` role. + ```sql + grant role accountadmin, sysadmin to user user2; + ``` + * Also remember to save your [account name](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#:~:text=organization_name%20is%20the%20name%20of,your%20account%20within%20your%20organization), username, and role. + * Your account name can be found under +3. Create Dashboard and add a Tile. +4. Create a warehouse and database named `FEAST` with the schemas `OFFLINE` and `ONLINE`. 
+ ```sql + create or replace warehouse feast_tests_wh with + warehouse_size='MEDIUM' --set your warehouse size to whatever your budget allows-- + auto_suspend = 180 + auto_resume = true + initially_suspended=true; + + create or replace database FEAST; + use database FEAST; + create schema OFFLINE; + create schema ONLINE; + ``` +5. You will need to create a data unloading location (either on S3, GCP, or Azure). Detailed instructions [here](https://docs.snowflake.com/en/user-guide/data-unload-overview.html). You will need to save the storage export location and the storage export name. You will need to create a [storage integration ](https://docs.snowflake.com/en/sql-reference/sql/create-storage-integration.html) in your warehouse to make this work. Name this storage integration `FEAST_S3`. +6. Then to run successfully, you'll need some environment variables setup: + ```sh + export SNOWFLAKE_CI_DEPLOYMENT='[your snowflake account name]' + export SNOWFLAKE_CI_USER='[your snowflake username]' + export SNOWFLAKE_CI_PASSWORD='[your snowflake pw]' + export SNOWFLAKE_CI_ROLE='[your CI role e.g. SYSADMIN]' + export SNOWFLAKE_CI_WAREHOUSE='[your warehouse]' + export BLOB_EXPORT_STORAGE_NAME='[your data unloading storage name]' + export BLOB_EXPORT_URI='[your data unloading blob uri]' + ``` +7. Once everything is setup, running snowflake integration tests should pass without failures. -Then run `make test-python-integration`. Note that for Snowflake / GCP / AWS, this will create new temporary tables / datasets. +Note that for Snowflake / GCP / AWS, running `make test-python-integration` will create new temporary tables / datasets in your cloud storage tables. -#### Running specific provider tests or running your test against specific online or offline stores +#### (Advanced) Running specific provider tests or running your test against specific online or offline stores 1.
If you don't need to have your test run against all of the providers(`gcp`, `aws`, and `snowflake`) or don't need to run against all of the online stores, you can tag your test with specific providers or stores that you need(`@pytest.mark.universal_online_stores` or `@pytest.mark.universal_online_stores` with the `only` parameter). The `only` parameter selects specific offline providers and online stores that your test will test against. Example: @@ -242,13 +323,63 @@ The services with containerized replacements currently implemented are: - Trino - HBase - Postgres +- Cassandra You can run `make test-python-integration-container` to run tests against the containerized versions of dependencies. +### Contrib integration tests +#### (Contrib) Running tests for Spark offline store +You can run `make test-python-universal-spark` to run all tests against the Spark offline store. (Note: you'll have to run `pip install -e ".[dev]"` first). + +Not all tests are passing yet + +#### (Contrib) Running tests for Trino offline store +You can run `make test-python-universal-trino` to run all tests against the Trino offline store. (Note: you'll have to run `pip install -e ".[dev]"` first) + +#### (Contrib) Running tests for Postgres offline store +You can run `test-python-universal-postgres-offline` to run all tests against the Postgres offline store. (Note: you'll have to run `pip install -e ".[dev]"` first) + +#### (Contrib) Running tests for Postgres online store +You can run `test-python-universal-postgres-online` to run all tests against the Postgres offline store. 
(Note: you'll have to run `pip install -e ".[dev]"` first) + +#### (Contrib) Running tests for HBase online store +TODO + +## (Experimental) Feast UI +See [Feast contributing guide](ui/CONTRIBUTING.md) ## Feast Java Serving See [Java contributing guide](java/CONTRIBUTING.md) +See also development instructions related to the helm chart below at [Developing the Feast Helm charts](#developing-the-feast-helm-charts) + +## Developing the Feast Helm charts +There are 3 helm charts: +- Feast Java feature server +- Feast Python / Go feature server +- (deprecated) Feast Python feature server + +Generally, you can override the images in the helm charts with locally built Docker images, and install the local helm +chart. + +All README's for helm charts are generated using [helm-docs](https://github.com/norwoodj/helm-docs). You can install it +(e.g. with `brew install norwoodj/tap/helm-docs`) and then run `make build-helm-docs`. + +### Feast Java Feature Server Helm Chart +See the Java demo example (it has development instructions too using minikube) [here](examples/java-demo/README.md) + +It will: +- run `make build-java-docker-dev` to build local Java feature server binaries +- configure the included `application-override.yaml` to override the image tag to use the locally built dev images. 
+- install the local chart with `helm install feast-release ../../../infra/charts/feast --values application-override.yaml` + +### Feast Python / Go Feature Server Helm Chart +See the Python demo example (it has development instructions too using minikube) [here](examples/python-helm-demo/README.md) + +It will: +- run `make build-feature-server-dev` to build a local python feature server binary +- install the local chart with `helm install feast-release ../../../infra/charts/feast-feature-server --set image.tag=dev --set feature_store_yaml_base64=$(base64 feature_store.yaml)` + ## Feast Go Client ### Environment Setup Setting up your development environment for Feast Go SDK: @@ -258,7 +389,7 @@ Setting up your development environment for Feast Go SDK: ### Building Build the Feast Go Client with the `go` toolchain: ```sh -go build +make compile-go-lib ``` ### Code Style & Linting @@ -281,19 +412,9 @@ go vet ### Unit Tests Unit tests for the Feast Go Client can be run as follows: ```sh -go test +make test-go ``` ### Testing with Github Actions workflows -* Update your current master on your forked branch and make a pull request against your own forked master. -* Enable workflows by going to actions and clicking `Enable Workflows`. - * Pushes will now run your edited workflow yaml file against your test code. - * Unfortunately, in order to test any github workflow changes, you must push the code to the branch and see the output in the actions tab. - -## Issues -* pr-integration-tests workflow is skipped - * Add `ok-to-test` github label. 
-* pr-integration-tests errors out with `Error: fatal: invalid refspec '+refs/pull//merge:refs/remotes/pull//merge'` - * This is because github actions cannot pull the branch version for some reason so just find your PR number in your pull request header and hard code it into the `uses: actions/checkout@v2` section (i.e replace `refs/pull/${{ github.event.pull_request.number }}/merge` with `refs/pull//merge`) -* AWS/GCP workflow - * Currently still cannot test GCP/AWS workflow without setting up secrets in a forked repository. + +Please refer to the maintainers [doc](./docs/project/maintainers.md) if you would like to locally test out the github actions workflow changes. This document will help you setup your fork to test the ci integration tests and other workflows without needing to make a pull request against feast-dev master. diff --git a/Makefile b/Makefile index ee2b7c8f1b..8e03ed5349 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,11 @@ benchmark-python-local: FEAST_USAGE=False IS_TEST=True FEAST_IS_LOCAL_TEST=True python -m pytest --integration --benchmark --benchmark-autosave --benchmark-save-data sdk/python/tests test-python: - FEAST_USAGE=False IS_TEST=True python -m pytest -n 8 sdk/python/tests + @(docker info > /dev/null 2>&1 && \ + FEAST_USAGE=False \ + IS_TEST=True \ + python -m pytest -n 8 sdk/python/tests \ + ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; test-python-integration: FEAST_USAGE=False IS_TEST=True python -m pytest -n 8 --integration sdk/python/tests @@ -75,13 +79,10 @@ test-python-integration-local: FEAST_IS_LOCAL_TEST=True \ FEAST_LOCAL_ONLINE_CONTAINER=True \ python -m pytest -n 8 --integration \ - -k "not test_apply_entity_integration and \ - not test_apply_feature_view_integration and \ - not test_apply_data_source_integration and \ - not test_lambda_materialization and \ - not test_feature_view_inference_success and \ - not 
test_update_file_data_source_with_inferred_event_timestamp_col and \ - not test_nullable_online_store" \ + -k "not gcs_registry and \ + not s3_registry and \ + not test_lambda_materialization and \ + not test_snowflake" \ sdk/python/tests \ ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; @@ -93,9 +94,33 @@ test-python-integration-container: python -m pytest -n 8 --integration sdk/python/tests \ ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; -test-python-universal-contrib: +test-python-universal-spark: PYTHONPATH='.' \ - FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.contrib_repo_configuration \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.spark_repo_configuration \ + PYTEST_PLUGINS=feast.infra.offline_stores.contrib.spark_offline_store.tests \ + FEAST_USAGE=False IS_TEST=True \ + python -m pytest -n 8 --integration \ + -k "not test_historical_retrieval_fails_on_validation and \ + not test_historical_retrieval_with_validation and \ + not test_historical_features_persisting and \ + not test_historical_retrieval_fails_on_validation and \ + not test_universal_cli and \ + not test_go_feature_server and \ + not test_feature_logging and \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store.py and \ + not gcs_registry and \ + not s3_registry and \ + not test_universal_types and \ + not test_snowflake" \ + sdk/python/tests + +test-python-universal-trino: + PYTHONPATH='.' 
\ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.trino_repo_configuration \ PYTEST_PLUGINS=feast.infra.offline_stores.contrib.trino_offline_store.tests \ FEAST_USAGE=False IS_TEST=True \ python -m pytest -n 8 --integration \ @@ -106,25 +131,130 @@ test-python-universal-contrib: not test_universal_cli and \ not test_go_feature_server and \ not test_feature_logging and \ - not test_universal_types" \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store.py and \ + not gcs_registry and \ + not s3_registry and \ + not test_universal_types and \ + not test_snowflake" \ sdk/python/tests -test-python-universal-postgres: + +# Note: to use this, you'll need to have Microsoft ODBC 17 installed. +# See https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15#17 +test-python-universal-mssql: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.mssql_repo_configuration \ + PYTEST_PLUGINS=feast.infra.offline_stores.contrib.mssql_offline_store.tests \ + FEAST_USAGE=False IS_TEST=True \ + FEAST_LOCAL_ONLINE_CONTAINER=True \ + python -m pytest -n 8 --integration \ + -k "not gcs_registry and \ + not s3_registry and \ + not test_lambda_materialization and \ + not test_snowflake" \ + sdk/python/tests + + +#To use Athena as an offline store, you need to create an Athena database and an S3 bucket on AWS. https://docs.aws.amazon.com/athena/latest/ug/getting-started.html +#Modify environment variables ATHENA_DATA_SOURCE, ATHENA_DATABASE, ATHENA_S3_BUCKET_NAME if you want to change the data source, database, and bucket name of S3 to use. +#If tests fail with the pytest -n 8 option, change the number to 1. +test-python-universal-athena: + PYTHONPATH='.' 
\ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.athena_repo_configuration \ + PYTEST_PLUGINS=feast.infra.offline_stores.contrib.athena_offline_store.tests \ + FEAST_USAGE=False IS_TEST=True \ + ATHENA_DATA_SOURCE=AwsDataCatalog \ + ATHENA_DATABASE=default \ + ATHENA_S3_BUCKET_NAME=feast-integration-tests \ + python -m pytest -n 8 --integration \ + -k "not test_go_feature_server and \ + not test_logged_features_validation and \ + not test_lambda and \ + not test_feature_logging and \ + not test_offline_write and \ + not test_push_offline and \ + not test_historical_retrieval_with_validation and \ + not test_historical_features_persisting and \ + not test_historical_retrieval_fails_on_validation and \ + not gcs_registry and \ + not s3_registry and \ + not test_snowflake" \ + sdk/python/tests + +test-python-universal-postgres-offline: PYTHONPATH='.' \ FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.postgres_repo_configuration \ PYTEST_PLUGINS=sdk.python.feast.infra.offline_stores.contrib.postgres_offline_store.tests \ FEAST_USAGE=False \ IS_TEST=True \ - python -m pytest -x --integration \ - -k "not test_historical_retrieval_fails_on_validation and \ - not test_historical_retrieval_with_validation and \ + python -m pytest -n 8 --integration \ + -k "not test_historical_retrieval_with_validation and \ not test_historical_features_persisting and \ - not test_historical_retrieval_fails_on_validation and \ - not test_universal_cli and \ - not test_go_feature_server and \ - not test_feature_logging and \ - not test_universal_types" \ - sdk/python/tests + not test_universal_cli and \ + not test_go_feature_server and \ + not test_feature_logging and \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store and \ + not gcs_registry and \ + not s3_registry and \ + not 
test_universal_types" \ + sdk/python/tests + +test-python-universal-postgres-online: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.online_stores.contrib.postgres_repo_configuration \ + PYTEST_PLUGINS=sdk.python.feast.infra.offline_stores.contrib.postgres_offline_store.tests \ + FEAST_USAGE=False \ + IS_TEST=True \ + python -m pytest -n 8 --integration \ + -k "not test_universal_cli and \ + not test_go_feature_server and \ + not test_feature_logging and \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store and \ + not gcs_registry and \ + not s3_registry and \ + not test_universal_types and \ + not test_snowflake" \ + sdk/python/tests + +test-python-universal-cassandra: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.online_stores.contrib.cassandra_repo_configuration \ + PYTEST_PLUGINS=sdk.python.tests.integration.feature_repos.universal.online_store.cassandra \ + FEAST_USAGE=False \ + IS_TEST=True \ + python -m pytest -x --integration \ + sdk/python/tests + +test-python-universal-cassandra-no-cloud-providers: + PYTHONPATH='.' 
\ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.online_stores.contrib.cassandra_repo_configuration \ + PYTEST_PLUGINS=sdk.python.tests.integration.feature_repos.universal.online_store.cassandra \ + FEAST_USAGE=False \ + IS_TEST=True \ + python -m pytest -x --integration \ + -k "not test_lambda_materialization_consistency and \ + not test_apply_entity_integration and \ + not test_apply_feature_view_integration and \ + not test_apply_entity_integration and \ + not test_apply_feature_view_integration and \ + not test_apply_data_source_integration and \ + not test_nullable_online_store and \ + not gcs_registry and \ + not s3_registry and \ + not test_snowflake" \ + sdk/python/tests test-python-universal: FEAST_USAGE=False IS_TEST=True python -m pytest -n 8 --integration sdk/python/tests @@ -199,7 +329,7 @@ install-go-ci-dependencies: python -m pip install pybindgen==0.22.0 protobuf==3.20.1 install-protoc-dependencies: - pip install grpcio-tools==1.47.0 mypy-protobuf==3.1.0 + pip install --ignore-installed protobuf grpcio-tools==1.47.0 mypy-protobuf==3.1.0 compile-protos-go: install-go-proto-dependencies install-protoc-dependencies python setup.py build_go_protos @@ -212,7 +342,7 @@ install-feast-ci-locally: # Needs feast package to setup the feature store # CGO flag is due to this issue: https://github.com/golang/go/wiki/InvalidFlag -test-go: compile-protos-go compile-go-lib install-feast-ci-locally +test-go: compile-protos-go compile-protos-python compile-go-lib install-feast-ci-locally CGO_LDFLAGS_ALLOW=".*" go test -tags cgo,ccalloc ./... 
format-go: @@ -223,7 +353,7 @@ lint-go: compile-protos-go compile-go-lib # Docker -build-docker: build-ci-docker build-feature-server-python-aws-docker build-feature-transformation-server-docker build-feature-server-java-docker +build-docker: build-ci-docker build-feature-server-python-docker build-feature-server-python-aws-docker build-feature-transformation-server-docker build-feature-server-java-docker push-ci-docker: docker push $(REGISTRY)/feast-ci:$(VERSION) @@ -232,13 +362,21 @@ push-ci-docker: build-ci-docker: docker buildx build -t $(REGISTRY)/feast-ci:$(VERSION) -f infra/docker/ci/Dockerfile --load . +push-feature-server-python-docker: + docker push $(REGISTRY)/feature-server:$$VERSION + +build-feature-server-python-docker: + docker buildx build --build-arg VERSION=$$VERSION \ + -t $(REGISTRY)/feature-server:$$VERSION \ + -f sdk/python/feast/infra/feature_servers/multicloud/Dockerfile --load . + push-feature-server-python-aws-docker: - docker push $(REGISTRY)/feature-server-python-aws:$$VERSION + docker push $(REGISTRY)/feature-server-python-aws:$$VERSION build-feature-server-python-aws-docker: - docker buildx build --build-arg VERSION=$$VERSION \ - -t $(REGISTRY)/feature-server-python-aws:$$VERSION \ - -f sdk/python/feast/infra/feature_servers/aws_lambda/Dockerfile --load . + docker buildx build --build-arg VERSION=$$VERSION \ + -t $(REGISTRY)/feature-server-python-aws:$$VERSION \ + -f sdk/python/feast/infra/feature_servers/aws_lambda/Dockerfile --load . push-feature-transformation-server-docker: docker push $(REGISTRY)/feature-transformation-server:$(VERSION) @@ -256,6 +394,22 @@ build-feature-server-java-docker: -t $(REGISTRY)/feature-server-java:$(VERSION) \ -f java/infra/docker/feature-server/Dockerfile --load . +# Dev images + +build-feature-server-dev: + docker buildx build --build-arg VERSION=dev \ + -t feastdev/feature-server:dev \ + -f sdk/python/feast/infra/feature_servers/multicloud/Dockerfile.dev --load . 
+ +build-java-docker-dev: + make build-java-no-tests REVISION=dev + docker buildx build --build-arg VERSION=dev \ + -t feastdev/feature-transformation-server:dev \ + -f sdk/python/feast/infra/transformation_servers/Dockerfile --load . + docker buildx build --build-arg VERSION=dev \ + -t feastdev/feature-server-java:dev \ + -f java/infra/docker/feature-server/Dockerfile.dev --load . + # Documentation install-dependencies-proto-docs: @@ -286,6 +440,11 @@ build-sphinx: compile-protos-python build-templates: python infra/scripts/compile-templates.py +build-helm-docs: + cd ${ROOT_DIR}/infra/charts/feast; helm-docs + cd ${ROOT_DIR}/infra/charts/feast-feature-server; helm-docs + cd ${ROOT_DIR}/infra/charts/feast-python-server; helm-docs + # Web UI # Note: requires node and yarn to be installed diff --git a/README.md b/README.md index ab69636a20..b663533710 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Feast (**Fea**ture **St**ore) is an open source feature store for machine learni Feast allows ML platform teams to: -* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (to serve pre-computed features online). * **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. This ensure that future feature values do not leak to models during training. 
* **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another. @@ -152,7 +152,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) - * [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL source (contrib plugin)](https://docs.feast.dev/reference/data-sources/mssql) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) @@ -161,7 +161,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) - * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL (contrib plugin)](https://docs.feast.dev/reference/offline-stores/mssql.md) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/offline-stores/postgres) * [x] [Trino (contrib plugin)](https://github.com/Shopify/feast-trino) @@ -177,7 +177,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Azure Cache for Redis (community 
plugin)](https://github.com/Azure/feast-azure) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/online-stores/postgres) * [x] [Custom online store support](https://docs.feast.dev/how-to-guides/adding-support-for-a-new-online-store) - * [x] [Cassandra / AstraDB](https://github.com/datastaxdevs/feast-cassandra-online-store) + * [x] [Cassandra / AstraDB](https://docs.feast.dev/reference/online-stores/cassandra) * [ ] Bigtable (in progress) * **Feature Engineering** * [x] On-demand Transformations (Alpha release. See [RFC](https://docs.google.com/document/d/1lgfIw0Drc65LpaxbUu49RCeJgMew547meSJttnUqz7c/edit#)) diff --git a/docs/README.md b/docs/README.md index 1b70f8fedc..b838e5fe5b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,11 +2,11 @@ ## What is Feast? -Feast (**Fea**ture **St**ore) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to realtime models. +Feast (**Fea**ture **St**ore) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to realtime models. Feast allows ML platform teams to: -* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (to serve pre-computed features online). * **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. 
This ensure that future feature values do not leak to models during training. * **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another. @@ -20,8 +20,6 @@ Feast allows ML platform teams to: Feast helps ML platform teams with DevOps experience productionize real-time models. Feast can also help these teams build towards a feature platform that improves collaboration between engineers and data scientists. - - Feast is likely **not** the right tool if you * are in an organization that’s just getting started with ML and is not yet sure what the business impact of ML is @@ -67,7 +65,7 @@ Explore the following resources to get started with Feast: * [Quickstart](getting-started/quickstart.md) is the fastest way to get started with Feast * [Concepts](getting-started/concepts/) describes all important Feast API concepts * [Architecture](getting-started/architecture-and-components/) describes Feast's overall architecture. -* [Tutorials](tutorials/tutorials-overview.md) shows full examples of using Feast in machine learning applications. +* [Tutorials](tutorials/tutorials-overview/) shows full examples of using Feast in machine learning applications. * [Running Feast with Snowflake/GCP/AWS](how-to-guides/feast-snowflake-gcp-aws/) provides a more in-depth guide to using Feast. * [Reference](reference/feast-cli-commands.md) contains detailed API and design documents. * [Contributing](project/contributing.md) contains resources for anyone who wants to contribute to Feast. 
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b0e88b413f..8ee4867730 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -10,33 +10,31 @@ * [Quickstart](getting-started/quickstart.md) * [Concepts](getting-started/concepts/README.md) * [Overview](getting-started/concepts/overview.md) - * [Data source](getting-started/concepts/data-source.md) - * [Dataset](getting-started/concepts/dataset.md) + * [Data ingestion](getting-started/concepts/data-ingestion.md) * [Entity](getting-started/concepts/entity.md) * [Feature view](getting-started/concepts/feature-view.md) - * [Stream feature view](getting-started/concepts/stream-feature-view.md) * [Feature retrieval](getting-started/concepts/feature-retrieval.md) * [Point-in-time joins](getting-started/concepts/point-in-time-joins.md) * [Registry](getting-started/concepts/registry.md) + * [\[Alpha\] Saved dataset](getting-started/concepts/dataset.md) * [Architecture](getting-started/architecture-and-components/README.md) * [Overview](getting-started/architecture-and-components/overview.md) - * [Feature repository](getting-started/architecture-and-components/feature-repository.md) * [Registry](getting-started/architecture-and-components/registry.md) * [Offline store](getting-started/architecture-and-components/offline-store.md) * [Online store](getting-started/architecture-and-components/online-store.md) - * [Provider](getting-started/architecture-and-components/provider.md) * [Batch Materialization Engine](getting-started/architecture-and-components/batch-materialization-engine.md) + * [Provider](getting-started/architecture-and-components/provider.md) * [Learning by example](getting-started/feast-workshop.md) * [Third party integrations](getting-started/third-party-integrations.md) * [FAQ](getting-started/faq.md) ## Tutorials -* [Overview](tutorials/tutorials-overview.md) -* [Driver ranking](tutorials/driver-ranking-with-feast.md) -* [Fraud detection on GCP](tutorials/fraud-detection.md) -* [Real-time credit scoring on 
AWS](tutorials/real-time-credit-scoring-on-aws.md) -* [Driver stats on Snowflake](tutorials/driver-stats-on-snowflake.md) +* [Sample use-case tutorials](tutorials/tutorials-overview/README.md) + * [Driver ranking](tutorials/tutorials-overview/driver-ranking-with-feast.md) + * [Fraud detection on GCP](tutorials/tutorials-overview/fraud-detection.md) + * [Real-time credit scoring on AWS](tutorials/tutorials-overview/real-time-credit-scoring-on-aws.md) + * [Driver stats on Snowflake](tutorials/tutorials-overview/driver-stats-on-snowflake.md) * [Validating historical features with Great Expectations](tutorials/validating-historical-features.md) * [Using Scalable Registry](tutorials/using-scalable-registry.md) * [Building streaming features](tutorials/building-streaming-features.md) @@ -50,17 +48,23 @@ * [Build a training dataset](how-to-guides/feast-snowflake-gcp-aws/build-a-training-dataset.md) * [Load data into the online store](how-to-guides/feast-snowflake-gcp-aws/load-data-into-the-online-store.md) * [Read features from the online store](how-to-guides/feast-snowflake-gcp-aws/read-features-from-the-online-store.md) + * [Scaling Feast](how-to-guides/scaling-feast.md) + * [Structuring Feature Repos](how-to-guides/structuring-repos.md) * [Running Feast in production](how-to-guides/running-feast-in-production.md) -* [Upgrading from Feast 0.9](https://docs.google.com/document/u/1/d/1AOsr\_baczuARjCpmZgVd8mCqTF4AZ49OEyU4Cn-uTT0/edit) * [Upgrading for Feast 0.20+](how-to-guides/automated-feast-upgrade.md) -* [Adding a custom batch materialization engine](how-to-guides/creating-a-custom-materialization-engine.md) -* [Adding a new online store](how-to-guides/adding-support-for-a-new-online-store.md) -* [Adding a new offline store](how-to-guides/adding-a-new-offline-store.md) +* [Customizing Feast](how-to-guides/customizing-feast/README.md) + * [Adding a custom batch materialization engine](how-to-guides/customizing-feast/creating-a-custom-materialization-engine.md) + * 
[Adding a new offline store](how-to-guides/customizing-feast/adding-a-new-offline-store.md) + * [Adding a new online store](how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md) + * [Adding a custom provider](how-to-guides/customizing-feast/creating-a-custom-provider.md) * [Adding or reusing tests](how-to-guides/adding-or-reusing-tests.md) ## Reference +* [Codebase Structure](reference/codebase-structure.md) +* [Type System](reference/type-system.md) * [Data sources](reference/data-sources/README.md) + * [Overview](reference/data-sources/overview.md) * [File](reference/data-sources/file.md) * [Snowflake](reference/data-sources/snowflake.md) * [BigQuery](reference/data-sources/bigquery.md) @@ -70,13 +74,18 @@ * [Kinesis](reference/data-sources/kinesis.md) * [Spark (contrib)](reference/data-sources/spark.md) * [PostgreSQL (contrib)](reference/data-sources/postgres.md) + * [Trino (contrib)](reference/data-sources/trino.md) + * [Azure Synapse + Azure SQL (contrib)](reference/data-sources/mssql.md) * [Offline stores](reference/offline-stores/README.md) + * [Overview](reference/offline-stores/overview.md) * [File](reference/offline-stores/file.md) * [Snowflake](reference/offline-stores/snowflake.md) * [BigQuery](reference/offline-stores/bigquery.md) * [Redshift](reference/offline-stores/redshift.md) * [Spark (contrib)](reference/offline-stores/spark.md) * [PostgreSQL (contrib)](reference/offline-stores/postgres.md) + * [Trino (contrib)](reference/offline-stores/trino.md) + * [Azure Synapse + Azure SQL (contrib)](reference/offline-stores/mssql.md) * [Online stores](reference/online-stores/README.md) * [SQLite](reference/online-stores/sqlite.md) * [Snowflake](reference/online-stores/snowflake.md) @@ -84,20 +93,25 @@ * [Datastore](reference/online-stores/datastore.md) * [DynamoDB](reference/online-stores/dynamodb.md) * [PostgreSQL (contrib)](reference/online-stores/postgres.md) + * [Cassandra + Astra DB (contrib)](reference/online-stores/cassandra.md) * 
[Providers](reference/providers/README.md) * [Local](reference/providers/local.md) * [Google Cloud Platform](reference/providers/google-cloud-platform.md) * [Amazon Web Services](reference/providers/amazon-web-services.md) + * [Azure](reference/providers/azure.md) +* [Batch Materialization Engines](reference/batch-materialization/README.md) + * [Bytewax](reference/batch-materialization/bytewax.md) + * [Snowflake](reference/batch-materialization/snowflake.md) * [Feature repository](reference/feature-repository/README.md) * [feature\_store.yaml](reference/feature-repository/feature-store-yaml.md) * [.feastignore](reference/feature-repository/feast-ignore.md) * [Feature servers](reference/feature-servers/README.md) * [Python feature server](reference/feature-servers/python-feature-server.md) - * [Go feature server](reference/feature-servers/go-feature-server.md) -* [\[Alpha\] Web UI](reference/alpha-web-ui.md) -* [\[Alpha\] Data quality monitoring](reference/dqm.md) + * [\[Alpha\] Go feature server](reference/feature-servers/go-feature-server.md) + * [\[Alpha\] AWS Lambda feature server](reference/feature-servers/alpha-aws-lambda-feature-server.md) +* [\[Beta\] Web UI](reference/alpha-web-ui.md) * [\[Alpha\] On demand feature view](reference/alpha-on-demand-feature-view.md) -* [\[Alpha\] AWS Lambda feature server](reference/alpha-aws-lambda-feature-server.md) +* [\[Alpha\] Data quality monitoring](reference/dqm.md) * [Feast CLI reference](reference/feast-cli-commands.md) * [Python API reference](http://rtd.feast.dev) * [Usage](reference/usage.md) @@ -106,6 +120,7 @@ * [Contribution process](project/contributing.md) * [Development guide](project/development-guide.md) + * [Maintainer Docs](project/maintainers.md) * [Versioning policy](project/versioning-policy.md) * [Release process](project/release-process.md) * [Feast 0.9 vs Feast 0.10+](project/feast-0.9-vs-feast-0.10+.md) diff --git a/docs/getting-started/architecture-and-components/README.md 
b/docs/getting-started/architecture-and-components/README.md index 8a6e181ea7..a67761b2fc 100644 --- a/docs/getting-started/architecture-and-components/README.md +++ b/docs/getting-started/architecture-and-components/README.md @@ -1,15 +1,25 @@ # Architecture -{% page-ref page="overview.md" %} +{% content-ref url="overview.md" %} +[overview.md](overview.md) +{% endcontent-ref %} -{% page-ref page="feature-repository.md" %} +{% content-ref url="registry.md" %} +[registry.md](registry.md) +{% endcontent-ref %} -{% page-ref page="registry.md" %} +{% content-ref url="offline-store.md" %} +[offline-store.md](offline-store.md) +{% endcontent-ref %} -{% page-ref page="offline-store.md" %} +{% content-ref url="online-store.md" %} +[online-store.md](online-store.md) +{% endcontent-ref %} -{% page-ref page="online-store.md" %} +{% content-ref url="batch-materialization-engine.md" %} +[batch-materialization-engine.md](batch-materialization-engine.md) +{% endcontent-ref %} -{% page-ref page="provider.md" %} - -{% page-reg page="batch-materialization-engine.md" %} +{% content-ref url="provider.md" %} +[provider.md](provider.md) +{% endcontent-ref %} diff --git a/docs/getting-started/architecture-and-components/batch-materialization-engine.md b/docs/getting-started/architecture-and-components/batch-materialization-engine.md index fb3c83ccb4..7be22fe125 100644 --- a/docs/getting-started/architecture-and-components/batch-materialization-engine.md +++ b/docs/getting-started/architecture-and-components/batch-materialization-engine.md @@ -4,7 +4,6 @@ A batch materialization engine is a component of Feast that's responsible for mo A materialization engine abstracts over specific technologies or frameworks that are used to materialize data. It allows users to use a pure local serialized approach (which is the default LocalMaterializationEngine), or delegates the materialization to seperate components (e.g. AWS Lambda, as implemented by the the LambdaMaterializaionEngine). 
-If the built-in engines are not sufficient, you can create your own custom materialization engine. Please see [this guide](../../how-to-guides/creating-a-custom-materialization-engine.md) for more details. +If the built-in engines are not sufficient, you can create your own custom materialization engine. Please see [this guide](../../how-to-guides/customizing-feast/creating-a-custom-materialization-engine.md) for more details. Please see [feature\_store.yaml](../../reference/feature-repository/feature-store-yaml.md#overview) for configuring engines. - diff --git a/docs/getting-started/architecture-and-components/feature-repository.md b/docs/getting-started/architecture-and-components/feature-repository.md deleted file mode 100644 index d231600eb8..0000000000 --- a/docs/getting-started/architecture-and-components/feature-repository.md +++ /dev/null @@ -1,27 +0,0 @@ -# Feature repository - -Feast users use Feast to manage two important sets of configuration: - -* Configuration about how to run Feast on your infrastructure -* Feature definitions - -With Feast, the above configuration can be written declaratively and stored as code in a central location. This central location is called a feature repository. The feature repository is the declarative source of truth for what the desired state of a feature store should be. - -The Feast CLI uses the feature repository to configure, deploy, and manage your feature store. - -An example structure of a feature repository is shown below: - -```text -$ tree -a -. -├── data -│ └── driver_stats.parquet -├── driver_features.py -├── feature_store.yaml -└── .feastignore - -1 directory, 4 files -``` - -For more details, see the [Feature repository](../../reference/feature-repository/) reference. 
- diff --git a/docs/getting-started/architecture-and-components/offline-store.md b/docs/getting-started/architecture-and-components/offline-store.md index 29a72bd5f0..48470c6547 100644 --- a/docs/getting-started/architecture-and-components/offline-store.md +++ b/docs/getting-started/architecture-and-components/offline-store.md @@ -1,17 +1,17 @@ # Offline store -Feast uses offline stores as storage and compute systems. Offline stores store historic time-series feature values. Feast does not generate these features, but instead uses the offline store as the interface for querying existing features in your organization. - -Offline stores are used primarily for two reasons +An offline store is an interface for working with historical time-series feature values that are stored in [data sources](../../getting-started/concepts/data-ingestion.md). +The `OfflineStore` interface has several different implementations, such as the `BigQueryOfflineStore`, each of which is backed by a different storage and compute engine. +For more details on which offline stores are supported, please see [Offline Stores](../../reference/offline-stores/). +Offline stores are primarily used for two reasons: 1. Building training datasets from time-series features. -2. Materializing \(loading\) features from the offline store into an online store in order to serve those features at low latency for prediction. - -Offline stores are configured through the [feature\_store.yaml](../../reference/offline-stores/). When building training datasets or materializing features into an online store, Feast will use the configured offline store along with the data sources you have defined as part of feature views to execute the necessary data operations. - -It is not possible to query all data sources from all offline stores, and only a single offline store can be used at a time. 
For example, it is not possible to query a BigQuery table from a `File` offline store, nor is it possible for a `BigQuery` offline store to query files from your local file system. +2. Materializing \(loading\) features into an online store to serve those features at low-latency in a production setting. -Please see the [Offline Stores](../../reference/offline-stores/) reference for more details on configuring offline stores. +Offline stores are configured through the [feature\_store.yaml](../../reference/offline-stores/). +When building training datasets or materializing features into an online store, Feast will use the configured offline store with your configured data sources to execute the necessary data operations. -Please see the [Push Source](reference/data-sources/push.md) for reference on how to push features directly to the offline store in your feature store. +Only a single offline store can be used at a time. +Moreover, offline stores are not compatible with all data sources; for example, the `BigQuery` offline store cannot be used to query a file-based data source. +Please see [Push Source](../../reference/data-sources/push.md) for more details on how to push features directly to the offline store in your feature store. diff --git a/docs/getting-started/architecture-and-components/overview.md b/docs/getting-started/architecture-and-components/overview.md index 97bd779503..b6e1c48e89 100644 --- a/docs/getting-started/architecture-and-components/overview.md +++ b/docs/getting-started/architecture-and-components/overview.md @@ -5,10 +5,10 @@ ## Functionality * **Create Batch Features:** ELT/ETL systems like Spark and SQL are used to transform data in the batch store. -* **Create Stream Features:** Stream features are created from streaming services such as Kafka or Kinesis, and can be pushed directly into Feast. 
+* **Create Stream Features:** Stream features are created from streaming services such as Kafka or Kinesis, and can be pushed directly into Feast via the [Push API](../../reference/data-sources/push.md). * **Feast Apply:** The user (or CI) publishes versioned controlled feature definitions using `feast apply`. This CLI command updates infrastructure and persists definitions in the object store registry. * **Feast Materialize:** The user (or scheduler) executes `feast materialize` which loads features from the offline store into the online store. -* **Model Training:** A model training pipeline is launched. It uses the Feast Python SDK to retrieve a training dataset and trains a model. +* **Model Training:** A model training pipeline is launched. It uses the Feast Python SDK to retrieve a training dataset that can be used for training models. * **Get Historical Features:** Feast exports a point-in-time correct training dataset based on the list of features and entity dataframe provided by the model training pipeline. * **Deploy Model:** The trained model binary (and list of features) are deployed into a model serving system. This step is not executed by Feast. * **Prediction:** A backend system makes a request for a prediction from the model serving service. @@ -25,9 +25,9 @@ A complete Feast deployment contains the following components: * Build and retrieve training datasets from the offline store. * Retrieve online features. * **Stream Processor:** The Stream Processor can be used to ingest feature data from streams and write it into the online or offline stores. Currently, there's an experimental Spark processor that's able to consume data from Kafka. -* **Batch Materialization Engine:** The [Batch Materialization Engine](batch-materialization-engine.md) component launches a process which loads data into the online store from the offline store. By default, Feast uses a local in-process engine implementation to materialize data. 
However, additional infrastructure can be used for a more scalable materialization process. -* **Online Store:** The online store is a database that stores only the latest feature values for each entity. The online store is populated by materialization jobs and from [stream ingestion](../../reference/data-sources/push.md). -* **Offline Store:** The offline store persists batch data that has been ingested into Feast. This data is used for producing training datasets. For feature retrieval and materialization, Feast does not manage the offline store directly, but runs queries against it. However, offline stores can be configured to write data to the offline store if Feast is configured to log served features and the offline store supports this functionality. +* **Batch Materialization Engine:** The [Batch Materialization Engine](batch-materialization-engine.md) component launches a process which loads data into the online store from the offline store. By default, Feast uses a local in-process engine implementation to materialize data. However, additional infrastructure can be used for a more scalable materialization process. +* **Online Store:** The online store is a database that stores only the latest feature values for each entity. The online store is either populated through materialization jobs or through [stream ingestion](../../reference/data-sources/push.md). +* **Offline Store:** The offline store persists batch data that has been ingested into Feast. This data is used for producing training datasets. For feature retrieval and materialization, Feast does not manage the offline store directly, but runs queries against it. However, offline stores can be configured to support writes if Feast configures logging functionality of served features. {% hint style="info" %} Java and Go Clients are also available for online feature retrieval. 
diff --git a/docs/getting-started/architecture-and-components/provider.md b/docs/getting-started/architecture-and-components/provider.md index 9eadf73ded..89f01c4e5b 100644 --- a/docs/getting-started/architecture-and-components/provider.md +++ b/docs/getting-started/architecture-and-components/provider.md @@ -1,10 +1,9 @@ # Provider -A provider is an implementation of a feature store using specific feature store components \(e.g. offline store, online store\) targeting a specific environment \(e.g. GCP stack\). +A provider is an implementation of a feature store using specific feature store components (e.g. offline store, online store) targeting a specific environment (e.g. GCP stack). -Providers orchestrate various components \(offline store, online store, infrastructure, compute\) inside an environment. For example, the `gcp` provider supports [BigQuery](https://cloud.google.com/bigquery) as an offline store and [Datastore](https://cloud.google.com/datastore) as an online store, ensuring that these components can work together seamlessly. Feast has three built-in providers \(`local`, `gcp`, and `aws`\) with default configurations that make it easy for users to start a feature store in a specific environment. These default configurations can be overridden easily. For instance, you can use the `gcp` provider but use Redis as the online store instead of Datastore. +Providers orchestrate various components (offline store, online store, infrastructure, compute) inside an environment. For example, the `gcp` provider supports [BigQuery](https://cloud.google.com/bigquery) as an offline store and [Datastore](https://cloud.google.com/datastore) as an online store, ensuring that these components can work together seamlessly. Feast has three built-in providers (`local`, `gcp`, and `aws`) with default configurations that make it easy for users to start a feature store in a specific environment. These default configurations can be overridden easily. 
For instance, you can use the `gcp` provider but use Redis as the online store instead of Datastore. -If the built-in providers are not sufficient, you can create your own custom provider. Please see [this guide](../../how-to-guides/creating-a-custom-provider.md) for more details. +If the built-in providers are not sufficient, you can create your own custom provider. Please see [this guide](../../how-to-guides/customizing-feast/creating-a-custom-provider.md) for more details. Please see [feature\_store.yaml](../../reference/feature-repository/feature-store-yaml.md#overview) for configuring providers. - diff --git a/docs/getting-started/architecture-and-components/registry.md b/docs/getting-started/architecture-and-components/registry.md index 6bbef98d17..0939fb53fc 100644 --- a/docs/getting-started/architecture-and-components/registry.md +++ b/docs/getting-started/architecture-and-components/registry.md @@ -2,11 +2,12 @@ The Feast feature registry is a central catalog of all the feature definitions and their related metadata. It allows data scientists to search, discover, and collaborate on new features. -Each Feast deployment has a single feature registry. Feast only supports file-based registries today, but supports three different backends +Each Feast deployment has a single feature registry. Feast only supports file-based registries today, but supports four different backends. * `Local`: Used as a local backend for storing the registry during development * `S3`: Used as a centralized backend for storing the registry on AWS * `GCS`: Used as a centralized backend for storing the registry on GCP +* `[Alpha] Azure`: Used as centralized backend for storing the registry on Azure Blob storage. The feature registry is updated during different operations when using Feast. 
More specifically, objects within the registry \(entities, feature views, feature services\) are updated when running `apply` from the Feast CLI, but metadata about objects can also be updated during operations like materialization. diff --git a/docs/getting-started/concepts/README.md b/docs/getting-started/concepts/README.md index 0fc415f059..e805e3b486 100644 --- a/docs/getting-started/concepts/README.md +++ b/docs/getting-started/concepts/README.md @@ -1,21 +1,33 @@ # Concepts -{% page-ref page="overview.md" %} +{% content-ref url="overview.md" %} +[overview.md](overview.md) +{% endcontent-ref %} -{% page-ref page="data-source.md" %} +{% content-ref url="data-ingestion.md" %} +[data-ingestion.md](data-ingestion.md) +{% endcontent-ref %} -{% page-ref page="dataset.md" %} +{% content-ref url="entity.md" %} +[entity.md](entity.md) +{% endcontent-ref %} -{% page-ref page="entity.md" %} +{% content-ref url="feature-view.md" %} +[feature-view.md](feature-view.md) +{% endcontent-ref %} -{% page-ref page="feature-view.md" %} +{% content-ref url="feature-retrieval.md" %} +[feature-retrieval.md](feature-retrieval.md) +{% endcontent-ref %} -{% page-ref page="feature-view.md" %} +{% content-ref url="point-in-time-joins.md" %} +[point-in-time-joins.md](point-in-time-joins.md) +{% endcontent-ref %} -{% page-ref page="stream-feature-view.md" %} +{% content-ref url="registry.md" %} +[registry.md](registry.md) +{% endcontent-ref %} -{% page-ref page="feature-retrieval.md" %} - -{% page-ref page="point-in-time-joins.md" %} - -{% page-ref page="registry.md" %} +{% content-ref url="dataset.md" %} +[dataset.md](dataset.md) +{% endcontent-ref %} diff --git a/docs/getting-started/concepts/data-ingestion.md b/docs/getting-started/concepts/data-ingestion.md new file mode 100644 index 0000000000..3dd3fbbd92 --- /dev/null +++ b/docs/getting-started/concepts/data-ingestion.md @@ -0,0 +1,95 @@ +# Data ingestion + +## Data source + +A data source in Feast refers to raw underlying data that 
users own (e.g. in a table in BigQuery). Feast does not manage any of the raw underlying data but instead, is in charge of loading this data and performing different operations on the data to retrieve or serve features. + +Feast uses a time-series data model to represent data. This data model is used to interpret feature data in data sources in order to build training datasets or materialize features into an online store. + +Below is an example data source with a single entity column (`driver`) and two feature columns (`trips_today`, and `rating`). + +![Ride-hailing data source](<../../.gitbook/assets/image (16).png>) + +Feast supports primarily **time-stamped** tabular data as data sources. There are many kinds of possible data sources: + +* **Batch data sources:** ideally, these live in data warehouses (BigQuery, Snowflake, Redshift), but can be in data lakes (S3, GCS, etc). Feast supports ingesting and querying data across both. +* **Stream data sources**: Feast does **not** have native streaming integrations. It does however facilitate making streaming features available in different environments. There are two kinds of sources: + * **Push sources** allow users to push features into Feast, and make it available for training / batch scoring ("offline"), for realtime feature serving ("online") or both. + * **\[Alpha] Stream sources** allow users to register metadata from Kafka or Kinesis sources. The onus is on the user to ingest from these sources, though Feast provides some limited helper methods to ingest directly from Kafka / Kinesis topics. +* **(Experimental) Request data sources:** This is data that is only available at request time (e.g. from a user action that needs an immediate model prediction response). This is primarily relevant as an input into [**on-demand feature views**](../../../docs/reference/alpha-on-demand-feature-view.md), which allow light-weight feature engineering and combining features across sources. 
+ +## Batch data ingestion + +Ingesting from batch sources is only necessary to power real-time models. This is done through **materialization**. Under the hood, Feast manages an _offline store_ (to scalably generate training data from batch sources) and an _online store_ (to provide low-latency access to features for real-time models). + +A key command to use in Feast is the `materialize_incremental` command, which fetches the _latest_ values for all entities in the batch source and ingests these values into the online store. + +Materialization can be called programmatically or through the CLI: + +
+ +Code example: programmatic scheduled materialization + +This snippet creates a feature store object which points to the registry (which knows of all defined features) and the online store (DynamoDB in this case), and + +```python +# Define Python callable +def materialize(): + repo_config = RepoConfig( + registry=RegistryConfig(path="s3://[YOUR BUCKET]/registry.pb"), + project="feast_demo_aws", + provider="aws", + offline_store="file", + online_store=DynamoDBOnlineStoreConfig(region="us-west-2") + ) + store = FeatureStore(config=repo_config) + store.materialize_incremental(datetime.datetime.now()) + +# (In production) Use Airflow PythonOperator +materialize_python = PythonOperator( + task_id='materialize_python', + python_callable=materialize, +) +``` + +
+ +
+ +Code example: CLI based materialization + + + +#### How to run this in the CLI + +```bash +CURRENT_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") +feast materialize-incremental $CURRENT_TIME +``` + +#### How to run this on Airflow + +```python +# Use BashOperator +materialize_bash = BashOperator( + task_id='materialize', + bash_command=f'feast materialize-incremental {datetime.datetime.now().replace(microsecond=0).isoformat()}', +) +``` + +
+ +### Batch data schema inference + +If the `schema` parameter is not specified when defining a data source, Feast attempts to infer the schema of the data source during `feast apply`. +The way it does this depends on the implementation of the offline store. For the offline stores that ship with Feast out of the box this inference is performed by inspecting the schema of the table in the cloud data warehouse, +or if a query is provided to the source, by running the query with a `LIMIT` clause and inspecting the result. + + +## Stream data ingestion + +Ingesting from stream sources happens either via a Push API or via a contrib processor that leverages an existing Spark context. + +* To **push data into the offline or online stores**: see [push sources](../../reference/data-sources/push.md) for details. +* (experimental) To **use a contrib Spark processor** to ingest from a topic, see [Tutorial: Building streaming features](../../tutorials/building-streaming-features.md) + diff --git a/docs/getting-started/concepts/data-source.md b/docs/getting-started/concepts/data-source.md deleted file mode 100644 index d468108ca1..0000000000 --- a/docs/getting-started/concepts/data-source.md +++ /dev/null @@ -1,12 +0,0 @@ -# Data source - -The data source refers to raw underlying data \(e.g. a table in BigQuery\). - -Feast uses a time-series data model to represent data. This data model is used to interpret feature data in data sources in order to build training datasets or when materializing features into an online store. - -Below is an example data source with a single entity \(`driver`\) and two features \(`trips_today`, and `rating`\). 
- -![Ride-hailing data source](../../.gitbook/assets/image%20%2816%29.png) - - - diff --git a/docs/getting-started/concepts/dataset.md b/docs/getting-started/concepts/dataset.md index 59f7168905..d55adb4703 100644 --- a/docs/getting-started/concepts/dataset.md +++ b/docs/getting-started/concepts/dataset.md @@ -1,22 +1,18 @@ -# Dataset +# \[Alpha] Saved dataset -Feast datasets allow for conveniently saving dataframes that include both features and entities to be subsequently used for data analysis and model training. -[Data Quality Monitoring](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98) was the primary motivation for creating dataset concept. +Feast datasets allow for conveniently saving dataframes that include both features and entities to be subsequently used for data analysis and model training. [Data Quality Monitoring](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98) was the primary motivation for creating dataset concept. Dataset's metadata is stored in the Feast registry and raw data (features, entities, additional input keys and timestamp) is stored in the [offline store](../architecture-and-components/offline-store.md). Dataset can be created from: -1. Results of historical retrieval -2. [planned] Logging request (including input for [on demand transformation](../../reference/alpha-on-demand-feature-view.md)) and response during feature serving -3. [planned] Logging features during writing to online store (from batch source or stream) +1. Results of historical retrieval +2. \[planned] Logging request (including input for [on demand transformation](../../reference/alpha-on-demand-feature-view.md)) and response during feature serving +3. 
\[planned] Logging features during writing to online store (from batch source or stream) -### Creating Saved Dataset from Historical Retrieval +### Creating a saved dataset from historical retrieval -To create a saved dataset from historical features for later retrieval or analysis, a user needs to call `get_historical_features` method first and then pass the returned retrieval job to `create_saved_dataset` method. -`create_saved_dataset` will trigger provided retrieval job (by calling `.persist()` on it) to store the data using specified `storage`. -Storage type must be the same as globally configured offline store (eg, it's impossible to persist data to Redshift with BigQuery source). -`create_saved_dataset` will also create SavedDataset object with all related metadata and will write it to the registry. +To create a saved dataset from historical features for later retrieval or analysis, a user needs to call the `get_historical_features` method first and then pass the returned retrieval job to the `create_saved_dataset` method. `create_saved_dataset` will trigger the provided retrieval job (by calling `.persist()` on it) to store the data using the specified `storage` behind the scenes. Storage type must be the same as the globally configured offline store (e.g. it's impossible to persist data to a different offline source). `create_saved_dataset` will also create a `SavedDataset` object with all of the related metadata and will write this object to the registry.
```python from feast import FeatureStore @@ -39,12 +35,13 @@ dataset = store.create_saved_dataset( dataset.to_df() ``` -Saved dataset can be later retrieved using `get_saved_dataset` method: +Saved dataset can be retrieved later using the `get_saved_dataset` method in the feature store: + ```python dataset = store.get_saved_dataset('my_training_dataset') dataset.to_df() ``` ---- +*** -Check out our [tutorial on validating historical features](../../tutorials/validating-historical-features.md) to see how this concept can be applied in real-world use case. \ No newline at end of file +Check out our [tutorial on validating historical features](../../tutorials/validating-historical-features.md) to see how this concept can be applied in a real-world use case. diff --git a/docs/getting-started/concepts/entity.md b/docs/getting-started/concepts/entity.md index 77cfc0aff2..9203c01352 100644 --- a/docs/getting-started/concepts/entity.md +++ b/docs/getting-started/concepts/entity.md @@ -3,20 +3,41 @@ An entity is a collection of semantically related features. Users define entities to map to the domain of their use case. For example, a ride-hailing service could have customers and drivers as their entities, which group related features that correspond to these customers and drivers. ```python -driver = Entity(name='driver', value_type=ValueType.STRING, join_keys=['driver_id']) +driver = Entity(name='driver', join_keys=['driver_id']) ``` -Entities are typically defined as part of feature views. Entity name is used to reference the entity from a feature view definition and join key is used to identify the physical primary key on which feature values should be stored and retrieved. These keys are used during the lookup of feature values from the online store and the join process in point-in-time joins. It is possible to define composite entities \(more than one entity object\) in a feature view. It is also possible for feature views to have zero entities. 
See [feature view](feature-view.md) for more details. +The _entity name_ is used to uniquely identify the entity (for example to show in the experimental Web UI). The _join key_ is used to identify the physical primary key on which feature values should be joined together to be retrieved during feature retrieval. -Entities should be reused across feature views. -## **Entity key** +### Use case #1: Defining and storing features -A related concept is an entity key. These are one or more entity values that uniquely describe a feature view record. In the case of an entity \(like a `driver`\) that only has a single entity field, the entity _is_ an entity key. However, it is also possible for an entity key to consist of multiple entity values. For example, a feature view with the composite entity of \(customer, country\) might have an entity key of \(1001, 5\). +Feast's primary object for defining features is a _feature view,_ which is a collection of features. Feature views map to 0 or more entities, since a feature can be associated with: -![](../../.gitbook/assets/image%20%2815%29.png) +* zero entities (e.g. a global feature like _num\_daily\_global\_transactions_) +* one entity (e.g. a user feature like _user\_age_ or _last\_5\_bought\_items_) +* multiple entities, aka a composite key (e.g. a user + merchant category feature like _num\_user\_purchases\_in\_merchant\_category)_ -Entity keys act as primary keys. They are used during the lookup of features from the online store, and they are also used to match feature rows across feature views during point-in-time joins. +Feast refers to this collection of entities for a feature view as an **entity key**. +![](<../../.gitbook/assets/image (15).png>) +Entities should be reused across feature views. This helps with discovery of features, since it enables data scientists to understand how other teams build features for the entity they are most interested in.
+Feast will use the feature view concept to then define the schema of groups of features in a low-latency online store. + +### Use case #2: Retrieving features + +At _training time_, users control what entities they want to look up, for example corresponding to train / test / validation splits. A user specifies a list of _entity keys + timestamps_ they want to fetch [point-in-time](./point-in-time-joins.md) correct features for to generate a training dataset. + +At _serving time_, users specify _entity key(s)_ to fetch the latest feature values which can power real-time model prediction (e.g. a fraud detection model that needs to fetch the latest transaction user's features to make a prediction). + +{% hint style="info" %} +**Q: Can I retrieve features for all entities?** + +Kind of. + +In practice, this is most relevant for _batch scoring models_ (e.g. predict user churn for all existing users) that are offline only. For these use cases, Feast supports generating features for a SQL-backed list of entities. There is an [open GitHub issue](https://github.com/feast-dev/feast/issues/1611) that welcomes contribution to make this a more intuitive API. + +For _real-time feature retrieval_, there is no out of the box support for this because it would promote expensive and slow scan operations which can affect the performance of other operations on your data sources. Users can still pass in a large list of entities for retrieval, but this does not scale well. +{% endhint %} diff --git a/docs/getting-started/concepts/feast-types.md b/docs/getting-started/concepts/feast-types.md new file mode 100644 index 0000000000..72741f263e --- /dev/null +++ b/docs/getting-started/concepts/feast-types.md @@ -0,0 +1,14 @@ +# Data Types in Feast + +Feast frequently has to mediate data across platforms and systems, each with its own unique type system. +To make this possible, Feast itself has a type system for all the types it is able to handle natively. 
+ +Feast's type system is built on top of [protobuf](https://github.com/protocolbuffers/protobuf). The messages that make up the type system can be found [here](https://github.com/feast-dev/feast/blob/master/protos/feast/types/Value.proto), and the corresponding Python classes that wrap them can be found [here](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/types.py). + +Feast supports primitive data types (numerical values, strings, bytes, booleans and timestamps). The only complex data type Feast supports is Arrays, and arrays cannot contain other arrays. + +Each feature or schema field in Feast is associated with a data type, which is stored in Feast's [registry](registry.md). These types are also used to ensure that Feast operates on values correctly (e.g. making sure that timestamp columns used for [point-in-time correct joins](point-in-time-joins.md) actually have the timestamp type). + +As a result, each system that Feast interacts with needs a way to translate data types from the native platform, into a Feast type. E.g., Snowflake SQL types are converted to Feast types [here](https://rtd.feast.dev/en/master/feast.html#feast.type_map.snowflake_python_type_to_feast_value_type). The onus is therefore on authors of offline or online store connectors to make sure that this type mapping happens correctly. + +**Note**: Feast currently does *not* support a null type in its type system.
\ No newline at end of file diff --git a/docs/getting-started/concepts/feature-repo.md b/docs/getting-started/concepts/feature-repo.md new file mode 100644 index 0000000000..a56f42ce10 --- /dev/null +++ b/docs/getting-started/concepts/feature-repo.md @@ -0,0 +1,22 @@ +# Feature Repository + +## Feature repository +Feast users use Feast to manage two important sets of configuration: + +* Configuration about how to run Feast on your infrastructure +* Feature definitions + +With Feast, the above configuration can be written declaratively and stored as code in a central location. This central location is called a feature repository. The feature repository is the declarative source of truth for what the desired state of a feature store should be. + +A feature repository is the collection of python files that define entities, feature views and data sources. Feature Repos also have a `feature_store.yaml` file at their root. + +Users can collaborate by making and reviewing changes to Feast object definitions (feature views, entities, etc.) in the feature repo. +But, these objects must be applied, either through API, or the CLI, for them to be available by downstream Feast actions (such as materialization, or retrieving online features). Internally, Feast only looks at the registry when performing these actions, and not at the feature repo directly. + +## Declarative Feature Definitions + +When using the CLI to apply changes (via `feast apply`), the CLI determines the state of the feature repo from the source files and updates the registry state to reflect the definitions in the feature repo files. +This means that new feature views are added to the registry, existing feature views are updated as necessary, and Feast objects removed from the source files are deleted from the registry. + +For more details, see the [Feature repository](../../reference/feature-repository/) reference. 
+ diff --git a/docs/getting-started/concepts/feature-retrieval.md b/docs/getting-started/concepts/feature-retrieval.md index 85b7d9c5b7..f4462d0690 100644 --- a/docs/getting-started/concepts/feature-retrieval.md +++ b/docs/getting-started/concepts/feature-retrieval.md @@ -1,14 +1,191 @@ # Feature retrieval -## Dataset +## Overview -A dataset is a collection of rows that is produced by a historical retrieval from Feast in order to train a model. A dataset is produced by a join from one or more feature views onto an entity dataframe. Therefore, a dataset may consist of features from multiple feature views. +Generally, Feast supports several patterns of feature retrieval: -**Dataset vs Feature View:** Feature views contain the schema of data and a reference to where data can be found \(through its data source\). Datasets are the actual data manifestation of querying those data sources. +1. Training data generation (via `feature_store.get_historical_features(...)`) +2. Offline feature retrieval for batch scoring (via `feature_store.get_historical_features(...)`) +3. Online feature retrieval for real-time model predictions + - via the SDK: `feature_store.get_online_features(...)` + - via deployed feature server endpoints: `requests.post('http://localhost:6566/get-online-features', data=json.dumps(online_request))` -**Dataset vs Data Source:** Datasets are the output of historical retrieval, whereas data sources are the inputs. One or more data sources can be used in the creation of a dataset. 
+Each of these retrieval mechanisms accept: + +* some way of specifying entities (to fetch features for) +* some way to specify the features to fetch (either via [feature services](feature-retrieval.md#feature-services), which group features needed for a model version, or [feature references](feature-retrieval.md#feature-references)) + +Before beginning, you need to instantiate a local `FeatureStore` object that knows how to parse the registry (see [more details](https://docs.feast.dev/getting-started/concepts/registry)) + +
+ +How to: generate training data + +Feast abstracts away point-in-time join complexities with the `get_historical_features` API. + +It expects an **entity dataframe (or SQL query to retrieve a list of entities)** and a **list of feature references (or a feature service)** + +#### **Option 1: using feature references (to pick individual features when exploring data)** + +```python +entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002, 1003, 1004, 1001], + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + datetime(2021, 4, 12, 15, 1, 12), + datetime.now() + ] + } +) +training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_daily_features:daily_miles_driven" + ], +).to_df() +print(training_df.head()) +``` + +#### Option 2: using feature services (to version models) + +```python +entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002, 1003, 1004, 1001], + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + datetime(2021, 4, 12, 15, 1, 12), + datetime.now() + ] + } +) +training_df = store.get_historical_features( + entity_df=entity_df, + features=store.get_feature_service("model_v1"), +).to_df() +print(training_df.head()) +``` + +
+ +
+ +How to: retrieve offline features for batch scoring + +The main difference here compared to training data generation is how to handle timestamps in the entity dataframe. You want to pass in the **current time** to get the latest feature values for all your entities. + +#### Option 1: fetching features with entity dataframe + +```python +from feast import FeatureStore +import pandas as pd + +store = FeatureStore(repo_path=".") + +# Get the latest feature values for unique entities +entity_df = pd.DataFrame.from_dict({"driver_id": [1001, 1002, 1003, 1004, 1005],}) +entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) +batch_scoring_features = store.get_historical_features( + entity_df=entity_df, features=store.get_feature_service("model_v2"), +).to_df() +# predictions = model.predict(batch_scoring_features) +``` + +#### Option 2: fetching features using a SQL query to generate entities + +```python +from feast import FeatureStore + +store = FeatureStore(repo_path=".") + +# Get the latest feature values for unique entities +batch_scoring_features = store.get_historical_features( + entity_df=""" + SELECT + user_id, + CURRENT_TIME() as event_timestamp + FROM entity_source_table + WHERE user_last_active_time BETWEEN '2019-01-01' and '2020-12-31' + GROUP BY user_id + """ + , + features=store.get_feature_service("model_v2"), +).to_df() +# predictions = model.predict(batch_scoring_features) +``` + +
+ +
+ +How to: retrieve online features for real-time model inference (Python SDK) + +Feast will ensure the latest feature values for registered features are available. At retrieval time, you need to supply a list of **entities** and the corresponding **features** to be retrieved. Similar to `get_historical_features`, we recommend using feature services as a mechanism for grouping features in a model version. + +_Note: unlike `get_historical_features`, the `entity_rows` **do not need timestamps** since you only want one feature value per entity key._ + +```python +from feast import RepoConfig, FeatureStore +from feast.repo_config import RegistryConfig + +repo_config = RepoConfig( + registry=RegistryConfig(path="gs://feast-test-gcs-bucket/registry.pb"), + project="feast_demo_gcp", + provider="gcp", +) +store = FeatureStore(config=repo_config) + +features = store.get_online_features( + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_daily_features:daily_miles_driven", + ], + entity_rows=[ + { + "driver_id": 1001, + } + ], +).to_dict() +``` + +
+ +
+ +How to: retrieve online features for real-time model inference (Feature Server) + +Feast will ensure the latest feature values for registered features are available. At retrieval time, you need to supply a list of **entities** and the corresponding **features** to be retrieved. Similar to `get_historical_features`, we recommend using feature services as a mechanism for grouping features in a model version. + +_Note: unlike `get_historical_features`, the `entity_rows` **do not need timestamps** since you only want one feature value per entity key._ + +This approach requires you to deploy a feature server (see [Python feature server](../../reference/feature-servers/python-feature-server)). + +```python +import requests +import json + +online_request = { + "features": [ + "driver_hourly_stats:conv_rate", + ], + "entities": {"driver_id": [1001, 1002]}, +} +r = requests.post('http://localhost:6566/get-online-features', data=json.dumps(online_request)) +print(json.dumps(r.json(), indent=4, sort_keys=True)) +``` + +
## Feature Services + A feature service is an object that represents a logical group of features from one or more [feature views](feature-view.md#feature-view). Feature Services allows features from within a feature view to be used as needed by an ML model. Users can expect to create one feature service per model version, allowing for tracking of the features used by models. {% tabs %} @@ -38,6 +215,7 @@ Applying a feature service does not result in an actual service being deployed. Feature services enable referencing all or some features from a feature view. Retrieving from the online store with a feature service + ```python from feast import FeatureStore feature_store = FeatureStore('.') # Initialize the feature store @@ -49,6 +227,7 @@ features = feature_store.get_online_features( ``` Retrieving from the offline store with a feature service + ```python from feast import FeatureStore feature_store = FeatureStore('.') # Initialize the feature store @@ -78,10 +257,10 @@ online_features = fs.get_online_features( ) ``` -It is possible to retrieve features from multiple feature views with a single request, and Feast is able to join features from multiple tables in order to build a training dataset. However, It is not possible to reference \(or retrieve\) features from multiple projects at the same time. +It is possible to retrieve features from multiple feature views with a single request, and Feast is able to join features from multiple tables in order to build a training dataset. However, it is not possible to reference (or retrieve) features from multiple projects at the same time. 
{% hint style="info" %} -Note, if you're using [Feature views without entities](feature-view.md#feature-views-without-entities), then those features can be added here without additional entity values in the `entity_rows` +Note, if you're using [Feature views without entities](feature-view.md#feature-views-without-entities), then those features can be added here without additional entity values in the `entity_rows` parameter. {% endhint %} ## Event timestamp @@ -90,3 +269,10 @@ The timestamp on which an event occurred, as found in a feature view's data sour Event timestamps are used during point-in-time joins to ensure that the latest feature values are joined from feature views onto entity rows. Event timestamps are also used to ensure that old feature values aren't served to models during online serving. +## Dataset + +A dataset is a collection of rows that is produced by a historical retrieval from Feast in order to train a model. A dataset is produced by a join from one or more feature views onto an entity dataframe. Therefore, a dataset may consist of features from multiple feature views. + +**Dataset vs Feature View:** Feature views contain the schema of data and a reference to where data can be found (through its data source). Datasets are the actual data manifestation of querying those data sources. + +**Dataset vs Data Source:** Datasets are the output of historical retrieval, whereas data sources are the inputs. One or more data sources can be used in the creation of a dataset. diff --git a/docs/getting-started/concepts/feature-view.md b/docs/getting-started/concepts/feature-view.md index d0b8004828..ccb380497d 100644 --- a/docs/getting-started/concepts/feature-view.md +++ b/docs/getting-started/concepts/feature-view.md @@ -2,17 +2,35 @@ ## Feature views -A feature view is an object that represents a logical group of time-series feature data as it is found in a [data source](data-source.md). 
Feature views consist of zero or more [entities](entity.md), one or more [features](feature-view.md#feature), and a [data source](data-source.md). Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment. Feature views generally contain features that are properties of a specific object, in which case that object is defined as an entity and included in the feature view. If the features are not related to a specific object, the feature view might not have entities; see [feature views without entities](feature-view.md#feature-views-without-entities) below. +{% hint style="warning" %} +**Note**: Feature views do not work with non-timestamped data. A workaround is to insert dummy timestamps. +{% endhint %} + +A feature view is an object that represents a logical group of time-series feature data as it is found in a [data source](data-ingestion.md). Depending on the kind of feature view, it may contain some lightweight (experimental) feature transformations (see [\[Alpha\] On demand feature views](feature-view.md#alpha-on-demand-feature-views)). + +Feature views consist of: + +* a [data source](data-ingestion.md) +* zero or more [entities](entity.md) + * If the features are not related to a specific object, the feature view might not have entities; see [feature views without entities](feature-view.md#feature-views-without-entities) below. +* a name to uniquely identify this feature view in the project. 
+* (optional, but recommended) a schema specifying one or more [features](feature-view.md#feature) (without this, Feast will infer the schema by reading from the data source) +* (optional, but recommended) metadata (for example, description, or other free-form metadata via `tags`) +* (optional) a TTL, which limits how far back Feast will look when generating historical datasets + +Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment. Feature views generally contain features that are properties of a specific object, in which case that object is defined as an entity and included in the feature view. {% tabs %} {% tab title="driver_trips_feature_view.py" %} ```python -from feast import BigQuerySource, FeatureView, Field +from feast import BigQuerySource, Entity, FeatureView, Field from feast.types import Float32, Int64 +driver = Entity(name="driver", join_keys=["driver_id"]) + driver_stats_fv = FeatureView( name="driver_activity", - entities=["driver"], + entities=[driver], schema=[ Field(name="trips_today", dtype=Int64), Field(name="rating", dtype=Float32), @@ -31,13 +49,9 @@ Feature views are used during * Loading of feature values into an online store. Feature views determine the storage schema in the online store. Feature values can be loaded from batch sources or from [stream sources](../../reference/data-sources/push.md). * Retrieval of features from the online store. Feature views provide the schema definition to Feast in order to look up features from the online store. -{% hint style="info" %} -Feast does not generate feature values. It acts as the ingestion and serving system. The data sources described within feature views should reference feature values in their already computed form. 
-{% endhint %} - ## Feature views without entities -If a feature view contains features that are not related to a specific entity, the feature view can be defined without entities (only event timestamps are needed for this feature view). +If a feature view contains features that are not related to a specific entity, the feature view can be defined without entities (only timestamps are needed for this feature view). {% tabs %} {% tab title="global_stats.py" %} @@ -61,9 +75,7 @@ global_stats_fv = FeatureView( ## Feature inferencing -If the `features` parameter is not specified in the feature view creation, Feast will infer the features during `feast apply` by creating a feature for each column in the -underlying data source except the columns corresponding to the entities of the feature view or the columns corresponding to the timestamp columns of the feature view's -data source. The names and value types of the inferred features will use the names and data types of the columns from which the features were inferred. +If the `schema` parameter is not specified in the creation of the feature view, Feast will infer the features during `feast apply` by creating a `Field` for each column in the underlying data source except the columns corresponding to the entities of the feature view or the columns corresponding to the timestamp columns of the feature view's data source. The names and value types of the inferred features will use the names and data types of the columns from which the features were inferred. 
## Entity aliasing @@ -76,16 +88,17 @@ It is suggested that you dynamically specify the new FeatureView name using `.wi {% tabs %} {% tab title="location_stats_feature_view.py" %} ```python -from feast import BigQuerySource, Entity, FeatureView, Field, ValueType -from feast.types import Int32 +from feast import BigQuerySource, Entity, FeatureView, Field +from feast.types import Int32, Int64 -location = Entity(name="location", join_keys=["location_id"], value_type=ValueType.INT64) +location = Entity(name="location", join_keys=["location_id"]) location_stats_fv= FeatureView( name="location_stats", - entities=["location"], + entities=[location], schema=[ - Field(name="temperature", dtype=Int32) + Field(name="temperature", dtype=Int32), + Field(name="location_id", dtype=Int64), ], source=BigQuerySource( table="feast-oss.demo_data.location_stats" @@ -117,11 +130,11 @@ temperatures_fs = FeatureService( {% endtab %} {% endtabs %} -## Feature +## Field -A feature is an individual measurable property. It is typically a property observed on a specific entity, but does not have to be associated with an entity. For example, a feature of a `customer` entity could be the number of transactions they have made on an average month, while a feature that is not observed on a specific entity could be the total number of posts made by all users in the last month. +A field or feature is an individual measurable property. It is typically a property observed on a specific entity, but does not have to be associated with an entity. For example, a feature of a `customer` entity could be the number of transactions they have made on an average month, while a feature that is not observed on a specific entity could be the total number of posts made by all users in the last month. Supported types for fields in Feast can be found in `sdk/python/feast/types.py`. -Features are defined as part of feature views. 
Since Feast does not transform data, a feature is essentially a schema that only contains a name and a type:
+Fields are defined as part of feature views. Since Feast does not transform data, a field is essentially a schema that only contains a name and a type:
 
 ```python
 from feast import Field
@@ -133,16 +146,32 @@ trips_today = Field(
 )
 ```
 
-Together with [data sources](data-source.md), they indicate to Feast where to find your feature values, e.g., in a specific parquet file or BigQuery table. Feature definitions are also used when reading features from the feature store, using [feature references](feature-retrieval.md#feature-references).
+Together with [data sources](data-ingestion.md), they indicate to Feast where to find your feature values, e.g., in a specific parquet file or BigQuery table. Feature definitions are also used when reading features from the feature store, using [feature references](feature-retrieval.md#feature-references).
 
 Feature names must be unique within a [feature view](feature-view.md#feature-view).
 
+Each field can have additional metadata associated with it, specified as key-value [tags](https://rtd.feast.dev/en/master/feast.html#feast.field.Field).
+
 ## \[Alpha] On demand feature views
 
-On demand feature views allows users to use existing features and request time data (features only available at request time) to transform and create new features. Users define python transformation logic which is executed in both historical retrieval and online retrieval paths:
+On demand feature views allow data scientists to use existing features and request time data (features only available at request time) to transform and create new features. Users define Python transformation logic which is executed in both the historical retrieval and online retrieval paths.
+
+Currently, these transformations are executed locally. This is fine for online serving, but does not scale well to offline retrieval.
+
+### Why use on demand feature views?
+ +This enables data scientists to easily impact the online feature retrieval path. For example, a data scientist could + +1. Call `get_historical_features` to generate a training dataframe +2. Iterate in notebook on feature engineering in Pandas +3. Copy transformation logic into on demand feature views and commit to a dev branch of the feature repository +4. Verify with `get_historical_features` (on a small dataset) that the transformation gives expected output over historical data +5. Verify with `get_online_features` on dev branch that the transformation correctly outputs online features +6. Submit a pull request to the staging / prod branches which impact production traffic ```python from feast import Field, RequestSource +from feast.on_demand_feature_view import on_demand_feature_view from feast.types import Float64 # Define a request data source which encodes features / information only @@ -172,3 +201,58 @@ def transformed_conv_rate(features_df: pd.DataFrame) -> pd.DataFrame: df['conv_rate_plus_val2'] = (features_df['conv_rate'] + features_df['val_to_add_2']) return df ``` + +## \[Alpha] Stream feature views + +A stream feature view is an extension of a normal feature view. The primary difference is that stream feature views have both stream and batch data sources, whereas a normal feature view only has a batch data source. + +Stream feature views should be used instead of normal feature views when there are stream data sources (e.g. Kafka and Kinesis) available to provide fresh features in an online setting. 
Here is an example definition of a stream feature view with an attached transformation: + +```python +from datetime import timedelta + +from feast import Field, FileSource, KafkaSource, stream_feature_view +from feast.data_format import JsonFormat +from feast.types import Float32 + +driver_stats_batch_source = FileSource( + name="driver_stats_source", + path="data/driver_stats.parquet", + timestamp_field="event_timestamp", +) + +driver_stats_stream_source = KafkaSource( + name="driver_stats_stream", + kafka_bootstrap_servers="localhost:9092", + topic="drivers", + timestamp_field="event_timestamp", + batch_source=driver_stats_batch_source, + message_format=JsonFormat( + schema_json="driver_id integer, event_timestamp timestamp, conv_rate double, acc_rate double, created timestamp" + ), + watermark_delay_threshold=timedelta(minutes=5), +) + +@stream_feature_view( + entities=[driver], + ttl=timedelta(seconds=8640000000), + mode="spark", + schema=[ + Field(name="conv_percentage", dtype=Float32), + Field(name="acc_percentage", dtype=Float32), + ], + timestamp_field="event_timestamp", + online=True, + source=driver_stats_stream_source, +) +def driver_hourly_stats_stream(df: DataFrame): + from pyspark.sql.functions import col + + return ( + df.withColumn("conv_percentage", col("conv_rate") * 100.0) + .withColumn("acc_percentage", col("acc_rate") * 100.0) + .drop("conv_rate", "acc_rate") + ) +``` + +See [here](https://github.com/feast-dev/streaming-tutorial) for a example of how to use stream feature views to register your own streaming data pipelines in Feast. diff --git a/docs/getting-started/concepts/overview.md b/docs/getting-started/concepts/overview.md index 7134073792..ffbad86c03 100644 --- a/docs/getting-started/concepts/overview.md +++ b/docs/getting-started/concepts/overview.md @@ -1,14 +1,29 @@ # Overview -The top-level namespace within Feast is a [project](overview.md#project). Users define one or more [feature views](feature-view.md) within a project. 
Each feature view contains one or more [features](feature-view.md#feature). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-source.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. +### Feast project structure -![](../../.gitbook/assets/image%20%287%29.png) +The top-level namespace within Feast is a **project**. Users define one or more [feature views](feature-view.md) within a project. Each feature view contains one or more [features](feature-view.md#feature). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-ingestion.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. -## Project +![](<../../.gitbook/assets/image (7).png>) -Projects provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. We recommend having a single feature store and a single project per environment \(`dev`, `staging`, `prod`\). +**Projects** provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. We recommend having a single feature store and a single project per environment (`dev`, `staging`, `prod`). 
-{% hint style="info" %} -Projects are currently being supported for backward compatibility reasons. Projects may change in the future as we simplify the Feast API. -{% endhint %} +### Data ingestion +For _offline use cases_ that only rely on batch data, Feast does not need to ingest data and can query your existing data (leveraging a compute engine, whether it be a data warehouse or (experimental) Spark / Trino). Feast can help manage **pushing** streaming features to a batch source to make features available for training. + +For _online use cases_, Feast supports **ingesting** features from batch sources to make them available online (through a process called **materialization**), and **pushing** streaming features to make them available both offline / online. We explore this more in the next concept page ([Data ingestion](data-ingestion.md)) + +### Feature registration and retrieval + +Features are _registered_ as code in a version controlled repository, and tie to data sources + model versions via the concepts of **entities, feature views,** and **feature services.** We explore these concepts more in the upcoming concept pages. These features are then _stored_ in a **registry**, which can be accessed across users and services. The features can then be _retrieved_ via SDK API methods or via a deployed **feature server** which exposes endpoints to query for online features (to power real time models). + + + +Feast supports several patterns of feature retrieval. 
+ +| Use case | Example | API | +| :------------------------------------------------------: | :----------------------------------------------------------------------------------------------------: | :-----------------------: | +| Training data generation | Fetching user and item features for (user, item) pairs when training a production recommendation model | `get_historical_features` | +| Offline feature retrieval for batch predictions | Predicting user churn for all users on a daily basis | `get_historical_features` | +| Online feature retrieval for real-time model predictions | Fetching pre-computed features to predict whether a real-time credit card transaction is fraudulent | `get_online_features` | diff --git a/docs/getting-started/concepts/point-in-time-joins.md b/docs/getting-started/concepts/point-in-time-joins.md index d2961e2f74..5567220900 100644 --- a/docs/getting-started/concepts/point-in-time-joins.md +++ b/docs/getting-started/concepts/point-in-time-joins.md @@ -7,13 +7,15 @@ Feature values in Feast are modeled as time-series records. Below is an example The above table can be registered with Feast through the following feature view: ```python -from feast import FeatureView, Field, FileSource +from feast import Entity, FeatureView, Field, FileSource from feast.types import Float32, Int64 from datetime import timedelta +driver = Entity(name="driver", join_keys=["driver_id"]) + driver_stats_fv = FeatureView( name="driver_hourly_stats", - entities=["driver"], + entities=[driver], schema=[ Field(name="trips_today", dtype=Int64), Field(name="earnings_today", dtype=Float32), @@ -46,7 +48,7 @@ training_df = store.get_historical_features( ) ``` -For each row within the entity dataframe, Feast will query and join the selected features from the appropriate feature view data source. Feast will scan backward in time from the entity dataframe timestamp up to a maximum of the TTL time. 
+For each row within the entity dataframe, Feast will query and join the selected features from the appropriate feature view data source. Feast will scan backward in time from the entity dataframe timestamp up to a maximum of the TTL time specified. ![](../../.gitbook/assets/image%20%2831%29.png) diff --git a/docs/getting-started/concepts/registry.md b/docs/getting-started/concepts/registry.md index 2236f31931..b228050944 100644 --- a/docs/getting-started/concepts/registry.md +++ b/docs/getting-started/concepts/registry.md @@ -1,9 +1,88 @@ # Registry -The Feast registry is where all applied Feast objects (e.g. Feature views, entities, etc) are stored. The registry exposes methods to apply, list, retrieve and delete these objects. The registry is abstraction, with multiple possible implementations. +Feast uses a registry to store all applied Feast objects (e.g. Feature views, entities, etc). The registry exposes +methods to apply, list, retrieve and delete these objects, and is an abstraction with multiple implementations. -By default, the registry Feast uses a file-based registry implementation, which stores the protobuf representation of the registry as a serialized file. This registry file can be stored in a local file system, or in cloud storage (in, say, S3 or GCS). +### Options for registry implementations -However, there's inherent limitations with a file-based registry, since changing a single field in the registry requires re-writing the whole registry file. With multiple concurrent writers, this presents a risk of data loss, or bottlenecks writes to the registry since all changes have to be serialized (e.g. when running materialization for multiple feature views or time ranges concurrently). +#### File-based registry +By default, Feast uses a file-based registry implementation, which stores the protobuf representation of the registry as +a serialized file. 
This registry file can be stored in a local file system, or in cloud storage (in, say, S3 or GCS, or Azure). -Alternatively, a [SQL Registry](../../tutorials/using-scalable-registry.md) can be used for a more scalable registry. \ No newline at end of file +The quickstart guides that use `feast init` will use a registry on a local file system. To allow Feast to configure +a remote file registry, you need to create a GCS / S3 bucket that Feast can understand: +{% tabs %} +{% tab title="Example S3 file registry" %} +```yaml +project: feast_demo_aws +provider: aws +registry: s3://[YOUR BUCKET YOU CREATED]/registry.pb +online_store: null +offline_store: + type: file +``` +{% endtab %} + +{% tab title="Example GCS file registry" %} +```yaml +project: feast_demo_gcp +provider: gcp +registry: gs://[YOUR BUCKET YOU CREATED]/registry.pb +online_store: null +offline_store: + type: file +``` +{% endtab %} +{% endtabs %} + +However, there are inherent limitations with a file-based registry, since changing a single field in the registry +requires re-writing the whole registry file. With multiple concurrent writers, this presents a risk of data loss, or +bottlenecks writes to the registry since all changes have to be serialized (e.g. when running materialization for +multiple feature views or time ranges concurrently). + +#### SQL Registry +Alternatively, a [SQL Registry](../../tutorials/using-scalable-registry.md) can be used for a more scalable registry. + +This supports any SQLAlchemy compatible database as a backend. The exact schema can be seen in [sql.py](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/registry/sql.py) + +### Updating the registry + +We recommend users store their Feast feature definitions in a version controlled repository, which then via CI/CD +automatically stays synced with the registry. Users will often also want multiple registries to correspond to +different environments (e.g. 
dev vs staging vs prod), with staging and production registries with locked down write +access since they can impact real user traffic. See [Running Feast in Production](../../how-to-guides/running-feast-in-production.md#1.-automatically-deploying-changes-to-your-feature-definitions) for details on how to set this up. + +### Accessing the registry from clients + +Users can specify the registry through a `feature_store.yaml` config file, or programmatically. We often see teams +preferring the programmatic approach because it makes notebook driven development very easy: + +#### Option 1: programmatically specifying the registry + +```python +repo_config = RepoConfig( + registry=RegistryConfig(path="gs://feast-test-gcs-bucket/registry.pb"), + project="feast_demo_gcp", + provider="gcp", + offline_store="file", # Could also be the OfflineStoreConfig e.g. FileOfflineStoreConfig + online_store="null", # Could also be the OnlineStoreConfig e.g. RedisOnlineStoreConfig +) +store = FeatureStore(config=repo_config) +``` + +#### Option 2: specifying the registry in the project's `feature_store.yaml` file + +```yaml +project: feast_demo_aws +provider: aws +registry: s3://feast-test-s3-bucket/registry.pb +online_store: null +offline_store: + type: file +``` + +Instantiating a `FeatureStore` object can then point to this: + +```python +store = FeatureStore(repo_path=".") +``` \ No newline at end of file diff --git a/docs/getting-started/concepts/stream-feature-view.md b/docs/getting-started/concepts/stream-feature-view.md deleted file mode 100644 index 2ce3993614..0000000000 --- a/docs/getting-started/concepts/stream-feature-view.md +++ /dev/null @@ -1,56 +0,0 @@ -# Stream feature view - -## Stream feature views - -A stream feature view is an extension of a normal feature view. The primary difference is that stream feature views have both stream and batch data sources, whereas a normal feature view only has a batch data source. 
- -Stream feature views should be used instead of normal feature views when there are stream data sources (e.g. Kafka and Kinesis) available to provide fresh features in an online setting. Here is an example definition of a stream feature view with an attached transformation: - -```python -from datetime import timedelta - -from feast import Field, FileSource, KafkaSource, stream_feature_view -from feast.data_format import JsonFormat -from feast.types import Float32 - -driver_stats_batch_source = FileSource( - name="driver_stats_source", - path="data/driver_stats.parquet", - timestamp_field="event_timestamp", -) - -driver_stats_stream_source = KafkaSource( - name="driver_stats_stream", - kafka_bootstrap_servers="localhost:9092", - topic="drivers", - timestamp_field="event_timestamp", - batch_source=driver_stats_batch_source, - message_format=JsonFormat( - schema_json="driver_id integer, event_timestamp timestamp, conv_rate double, acc_rate double, created timestamp" - ), - watermark_delay_threshold=timedelta(minutes=5), -) - -@stream_feature_view( - entities=[driver], - ttl=timedelta(seconds=8640000000), - mode="spark", - schema=[ - Field(name="conv_percentage", dtype=Float32), - Field(name="acc_percentage", dtype=Float32), - ], - timestamp_field="event_timestamp", - online=True, - source=driver_stats_stream_source, -) -def driver_hourly_stats_stream(df: DataFrame): - from pyspark.sql.functions import col - - return ( - df.withColumn("conv_percentage", col("conv_rate") * 100.0) - .withColumn("acc_percentage", col("acc_rate") * 100.0) - .drop("conv_rate", "acc_rate") - ) -``` - -See [here](https://github.com/feast-dev/streaming-tutorial) for a example of how to use stream feature views. 
diff --git a/docs/getting-started/faq.md b/docs/getting-started/faq.md index b2438fdf7a..a511ddb0dc 100644 --- a/docs/getting-started/faq.md +++ b/docs/getting-started/faq.md @@ -10,7 +10,7 @@ We encourage you to ask questions on [Slack](https://slack.feast.dev) or [GitHub ### Do you have any examples of how Feast should be used? -The [quickstart](quickstart.md) is the easiest way to learn about Feast. For more detailed tutorials, please check out the [tutorials](../tutorials/tutorials-overview.md) page. +The [quickstart](quickstart.md) is the easiest way to learn about Feast. For more detailed tutorials, please check out the [tutorials](../tutorials/tutorials-overview/) page. ## Concepts @@ -19,13 +19,14 @@ The [quickstart](quickstart.md) is the easiest way to learn about Feast. For mor No, there are [feature views without entities](concepts/feature-view.md#feature-views-without-entities). ### How does Feast handle model or feature versioning? -Feast expects that each version of a model corresponds to a different feature service. -Feature views once they are used by a feature service are intended to be immutable and not deleted (until a feature service is removed). In the future, `feast plan` and `feast apply will throw errors if it sees this kind of behavior. +Feast expects that each version of a model corresponds to a different feature service. + +Feature views once they are used by a feature service are intended to be immutable and not deleted (until a feature service is removed). In the future, `feast plan` and `feast apply` will throw errors if it sees this kind of behavior. ### What is the difference between data sources and the offline store? -The data source itself defines the underlying data warehouse table in which the features are stored. The offline store interface defines the APIs required to make an arbitrary compute layer work for Feast (e.g. 
pulling features given a set of feature views from their sources, exporting the data set results to different formats). Please see [data sources](concepts/data-source.md) and [offline store](architecture-and-components/offline-store.md) for more details. +The data source itself defines the underlying data warehouse table in which the features are stored. The offline store interface defines the APIs required to make an arbitrary compute layer work for Feast (e.g. pulling features given a set of feature views from their sources, exporting the data set results to different formats). Please see [data sources](concepts/data-ingestion.md) and [offline store](architecture-and-components/offline-store.md) for more details. ### Is it possible to have offline and online stores from different providers? @@ -34,6 +35,7 @@ Yes, this is possible. For example, you can use BigQuery as an offline store and ## Functionality ### How do I run `get_historical_features` without providing an entity dataframe? + Feast does not provide a way to do this right now. This is an area we're actively interested in contributions for. See [GitHub issue](https://github.com/feast-dev/feast/issues/1611) ### Does Feast provide security or access control? @@ -44,19 +46,21 @@ It is a good idea though to lock down the registry file so only the CI/CD pipeli ### Does Feast support streaming sources? -Yes. In earlier versions of Feast, we used Feast Spark to manage ingestion from stream sources. In the current version of Feast, we support [push based ingestion](../reference/data-sources/push.md). Streaming transformations are actively being worked on. +Yes. In earlier versions of Feast, we used Feast Spark to manage ingestion from stream sources. In the current version of Feast, we support [push based ingestion](../reference/data-sources/push.md). Feast also defines a [stream processor](../tutorials/building-streaming-features.md) that allows a deeper integration with stream sources. 
### Does Feast support feature transformation?
 
 There are several kinds of transformations:
-- On demand transformations (See [docs](../reference/alpha-on-demand-feature-view.md))
-  - These transformations are Pandas transformations run on batch data when you call `get_historical_features` and at online serving time when you call `get_online_features.
-  - Note that if you use push sources to ingest streaming features, these transformations will execute on the fly as well
-- Batch transformations (WIP, see [RFC](https://docs.google.com/document/d/1964OkzuBljifDvkV-0fakp2uaijnVzdwWNGdz7Vz50A/edit#))
-  - These will include SQL + PySpark based transformations on batch data sources.
-- Streaming transformations (RFC in progress)
+
+* On demand transformations (See [docs](../reference/alpha-on-demand-feature-view.md))
+  * These transformations are Pandas transformations run on batch data when you call `get_historical_features` and at online serving time when you call `get_online_features`.
+  * Note that if you use push sources to ingest streaming features, these transformations will execute on the fly as well
+* Batch transformations (WIP, see [RFC](https://docs.google.com/document/d/1964OkzuBljifDvkV-0fakp2uaijnVzdwWNGdz7Vz50A/edit))
+  * These will include SQL + PySpark based transformations on batch data sources.
+* Streaming transformations (RFC in progress)
 
 ### Does Feast have a Web UI?
+
 Yes. See [documentation](../reference/alpha-web-ui.md).
 
 ### Does Feast support composite keys?
@@ -84,15 +88,26 @@ Yes. Specifically:
 
 ### Does Feast support X storage engine?
 
-The list of supported offline and online stores can be found [here](../reference/offline-stores/) and [here](../reference/online-stores/), respectively. The [roadmap](../roadmap.md) indicates the stores for which we are planning to add support. Finally, our Provider abstraction is built to be extensible, so you can plug in your own implementations of offline and online stores. 
Please see more details about custom providers [here](../how-to-guides/creating-a-custom-provider.md). +The list of supported offline and online stores can be found [here](../reference/offline-stores/) and [here](../reference/online-stores/), respectively. The [roadmap](../roadmap.md) indicates the stores for which we are planning to add support. Finally, our Provider abstraction is built to be extensible, so you can plug in your own implementations of offline and online stores. Please see more details about customizing Feast [here](../how-to-guides/customizing-feast/). ### Does Feast support using different clouds for offline vs online stores? Yes. Using a GCP or AWS provider in `feature_store.yaml` primarily sets default offline / online stores and configures where the remote registry file can live (Using the AWS provider also allows for deployment to AWS Lambda). You can override the offline and online stores to be in different clouds if you wish. +### What is the difference between a data source and an offline store? + +The data source and the offline store are closely tied, but separate concepts. +The offline store controls how feast talks to a data store for historical feature retrieval, and the data source points to specific table (or query) within a data store. Offline stores are infrastructure-level connectors to data stores like Snowflake. + +Additional differences: + +- Data sources may be specific to a project (e.g. feed ranking), but offline stores are agnostic and used across projects. +- A feast project may define several data sources that power different feature views, but a feast project has a single offline store. +- Feast users typically need to define data sources when using feast, but only need to use/configure existing offline stores without creating new ones. + ### How can I add a custom online store? -Please follow the instructions [here](../how-to-guides/adding-support-for-a-new-online-store.md). 
+Please follow the instructions [here](../how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md). ### Can the same storage engine be used for both the offline and online store? @@ -105,10 +120,6 @@ Yes. There are two ways to use S3 in Feast: * Using Redshift as a data source via Spectrum ([AWS tutorial](https://docs.aws.amazon.com/redshift/latest/dg/tutorial-nested-data-create-table.html)), and then continuing with the [Running Feast with Snowflake/GCP/AWS](../how-to-guides/feast-snowflake-gcp-aws/) guide. See a [presentation](https://youtu.be/pMFbRJ7AnBk?t=9463) we did on this at our apply() meetup. * Using the `s3_endpoint_override` in a `FileSource` data source. This endpoint is more suitable for quick proof of concepts that won't necessarily scale for production use cases. -### How can I use Spark with Feast? - -Feast supports ingestion via Spark (See ) does not support Spark natively. However, you can create a [custom provider](../how-to-guides/creating-a-custom-provider.md) that will support Spark, which can help with more scalable materialization and ingestion. - ### Is Feast planning on supporting X functionality? Please see the [roadmap](../roadmap.md). @@ -119,7 +130,6 @@ Please see the [roadmap](../roadmap.md). For more details on contributing to the Feast community, see [here](../community.md) and this [here](../project/contributing.md). - ## Feast 0.9 (legacy) ### What is the difference between Feast 0.9 and Feast 0.10+? @@ -130,7 +140,6 @@ Feast 0.10+ is much lighter weight and more extensible than Feast 0.9. It is des Please see this [document](https://docs.google.com/document/d/1AOsr\_baczuARjCpmZgVd8mCqTF4AZ49OEyU4Cn-uTT0). If you have any questions or suggestions, feel free to leave a comment on the document! - ### What are the plans for Feast Core, Feast Serving, and Feast Spark? -Feast Core and Feast Serving were both part of Feast Java. We plan to support Feast Serving. 
We will not support Feast Core; instead we will support our object store based registry. We will not support Feast Spark. For more details on what we plan on supporting, please see the [roadmap](../roadmap.md). \ No newline at end of file +Feast Core and Feast Serving were both part of Feast Java. We plan to support Feast Serving. We will not support Feast Core; instead we will support our object store based registry. We will not support Feast Spark. For more details on what we plan on supporting, please see the [roadmap](../roadmap.md). diff --git a/docs/getting-started/feast-workshop.md b/docs/getting-started/feast-workshop.md index 8b6778c2d3..0d64845222 100644 --- a/docs/getting-started/feast-workshop.md +++ b/docs/getting-started/feast-workshop.md @@ -30,15 +30,15 @@ _See also:_ [_Feast quickstart_](https://docs.feast.dev/getting-started/quicksta These are meant mostly to be done in order, with examples building on previous concepts. -See https://github.com/feast-dev/feast-workshop - -| Time (min) | Description | Module | -| :--------: | ----------------------------------------------------------------------- |-----------| -| 30-45 | Setting up Feast projects & CI/CD + powering batch predictions | Module 0 | -| 15-20 | Streaming ingestion & online feature retrieval with Kafka, Spark, Redis | Module 1 | -| 10-15 | Real-time feature engineering with on demand transformations | Module 2 | -| TBD | Feature server deployment (embed, as a service, AWS Lambda) | TBD | -| TBD | Versioning features / models in Feast | TBD | -| TBD | Data quality monitoring in Feast | TBD | -| TBD | Batch transformations | TBD | -| TBD | Stream transformations | TBD | +See [https://github.com/feast-dev/feast-workshop](https://github.com/feast-dev/feast-workshop) + +| Time (min) | Description | Module | +| :--------: | ----------------------------------------------------------------------- | -------- | +| 30-45 | Setting up Feast projects & CI/CD + powering batch predictions | Module 0 | 
+| 15-20 | Streaming ingestion & online feature retrieval with Kafka, Spark, Redis | Module 1 | +| 10-15 | Real-time feature engineering with on demand transformations | Module 2 | +| TBD | Feature server deployment (embed, as a service, AWS Lambda) | TBD | +| TBD | Versioning features / models in Feast | TBD | +| TBD | Data quality monitoring in Feast | TBD | +| TBD | Batch transformations | TBD | +| TBD | Stream transformations | TBD | diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index 7bbcb78732..57b007707a 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -4,31 +4,38 @@ In this tutorial we will 1. Deploy a local feature store with a **Parquet file offline store** and **Sqlite online store**. 2. Build a training dataset using our time series features from our **Parquet files**. -3. Materialize feature values from the offline store into the online store. -4. Read the latest features from the online store for inference. - -You can run this tutorial in Google Colab or run it on your localhost, following the guided steps below. - -![](../.gitbook/assets/colab\_logo\_32px.png)[**Run in Google Colab**](https://colab.research.google.com/github/feast-dev/feast/blob/master/examples/quickstart/quickstart.ipynb) +3. Ingest batch features ("materialization") and streaming features (via a Push API) into the online store. +4. Read the latest features from the offline store for batch scoring +5. Read the latest features from the online store for real-time inference. +6. Explore the (experimental) Feast UI ## Overview -In this tutorial, we use feature stores to generate training data and power online model inference for a ride-sharing driver satisfaction prediction model. Feast solves several common issues in this flow: - -1. **Training-serving skew and complex data joins:** Feature values often exist across multiple tables. Joining these datasets can be complicated, slow, and error-prone. 
- * Feast joins these tables with battle-tested logic that ensures _point-in-time_ correctness so future feature values do not leak to models. - * Feast alerts users to offline / online skew with data quality monitoring -2. **Online feature availability:** At inference time, models often need access to features that aren't readily available and need to be precomputed from other datasources. - * Feast manages deployment to a variety of online stores (e.g. DynamoDB, Redis, Google Cloud Datastore) and ensures necessary features are consistently _available_ and _freshly computed_ at inference time. -3. **Feature reusability and model versioning:** Different teams within an organization are often unable to reuse features across projects, resulting in duplicate feature creation logic. Models have data dependencies that need to be versioned, for example when running A/B tests on model versions. - * Feast enables discovery of and collaboration on previously used features and enables versioning of sets of features (via _feature services_). - * Feast enables feature transformation so users can re-use transformation logic across online / offline usecases and across models. +In this tutorial, we'll use Feast to generate training data and power online model inference for a +ride-sharing driver satisfaction prediction model. Feast solves several common issues in this flow: + +1. **Training-serving skew and complex data joins:** Feature values often exist across multiple tables. Joining + these datasets can be complicated, slow, and error-prone. + * Feast joins these tables with battle-tested logic that ensures _point-in-time_ correctness so future feature + values do not leak to models. +2. **Online feature availability:** At inference time, models often need access to features that aren't readily + available and need to be precomputed from other data sources. + * Feast manages deployment to a variety of online stores (e.g. 
DynamoDB, Redis, Google Cloud Datastore) and + ensures necessary features are consistently _available_ and _freshly computed_ at inference time. +3. **Feature and model versioning:** Different teams within an organization are often unable to reuse + features across projects, resulting in duplicate feature creation logic. Models have data dependencies that need + to be versioned, for example when running A/B tests on model versions. + * Feast enables discovery of and collaboration on previously used features and enables versioning of sets of + features (via _feature services_). + * _(Experimental)_ Feast enables light-weight feature transformations so users can re-use transformation logic + across online / offline use cases and across models. ## Step 1: Install Feast Install the Feast SDK and CLI using pip: -* In this tutorial, we focus on a local deployment. For a more in-depth guide on how to use Feast with Snowflake / GCP / AWS deployments, see [Running Feast with Snowflake/GCP/AWS](../how-to-guides/feast-snowflake-gcp-aws/) +* In this tutorial, we focus on a local deployment. For a more in-depth guide on how to use Feast with Snowflake / + GCP / AWS deployments, see [Running Feast with Snowflake/GCP/AWS](../how-to-guides/feast-snowflake-gcp-aws/) {% tabs %} {% tab title="Bash" %} @@ -40,13 +47,13 @@ pip install feast ## Step 2: Create a feature repository -Bootstrap a new feature repository using `feast init` from the command line. +Bootstrap a new feature repository using `feast init` from the command line. {% tabs %} {% tab title="Bash" %} ```bash -feast init feature_repo -cd feature_repo +feast init my_project +cd my_project/feature_repo ``` {% endtab %} {% endtabs %} @@ -54,7 +61,7 @@ cd feature_repo {% tabs %} {% tab title="Output" %} ``` -Creating a new Feast repository in /home/Jovyan/feature_repo. +Creating a new Feast repository in /home/Jovyan/my_project. 
``` {% endtab %} {% endtabs %} @@ -62,80 +69,149 @@ Creating a new Feast repository in /home/Jovyan/feature_repo. Let's take a look at the resulting demo repo itself. It breaks down into * `data/` contains raw demo parquet data -* `example.py` contains demo feature definitions +* `example_repo.py` contains demo feature definitions * `feature_store.yaml` contains a demo setup configuring where data sources are +* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features. + You can run this with `python test_workflow.py`. {% tabs %} {% tab title="feature_store.yaml" %} ```yaml project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) registry: data/registry.db +# The provider primarily specifies default offline / online stores & storing the registry in a given cloud provider: local online_store: - path: data/online_store.db + type: sqlite + path: data/online_store.db +entity_key_serialization_version: 2 ``` {% endtab %} -{% tab title="example.py" %} +{% tab title="example_repo.py" %} ```python # This is an example feature definition file from datetime import timedelta -from feast import Entity, FeatureService, FeatureView, Field, FileSource, ValueType -from feast.types import Float32, Int64 +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + FileSource, + PushSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) # Read data from parquet files. Parquet is convenient for local development mode. For # production, you can use your favorite DWH, such as BigQuery. See Feast documentation # for more info. 
-driver_hourly_stats = FileSource( - path="/content/feature_repo/data/driver_stats.parquet", +driver_stats_source = FileSource( + name="driver_hourly_stats_source", + path="%PARQUET_PATH%", timestamp_field="event_timestamp", created_timestamp_column="created", ) -# Define an entity for the driver. You can think of entity as a primary key used to -# fetch features. -# Entity has a name used for later reference (in a feature view, eg) -# and join_key to identify physical field name used in storages -driver = Entity(name="driver", value_type=ValueType.INT64, join_keys=["driver_id"], description="driver id",) - # Our parquet files contain sample data that includes a driver_id column, timestamps and # three feature column. Here we define a Feature View that will allow us to serve this # data to our model online. -driver_hourly_stats_view = FeatureView( +driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name name="driver_hourly_stats", - entities=["driver"], # reference entity by name - ttl=timedelta(seconds=86400 * 1), + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features schema=[ Field(name="conv_rate", dtype=Float32), Field(name="acc_rate", dtype=Float32), Field(name="avg_daily_trips", dtype=Int64), ], online=True, - source=driver_hourly_stats, - tags={}, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, ) -driver_stats_fs = FeatureService( - name="driver_activity", - features=[driver_hourly_stats_view] +# Defines a way to push data (to be available offline, online or both) into Feast. 
+driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] ) ``` {% endtab %} {% endtabs %} -The key line defining the overall architecture of the feature store is the **provider**. This defines where the raw data exists (for generating training data & feature values for serving), and where to materialize feature values to in the online store (for serving). +The `feature_store.yaml` file configures the key overall architecture of the feature store. + +The provider value sets default offline and online stores. +* The offline store provides the compute layer to process historical data (for generating training data & feature + values for serving). 
+* The online store is a low latency store of the latest feature values (for powering real-time inference). Valid values for `provider` in `feature_store.yaml` are: -* local: use file source with SQLite/Redis -* gcp: use BigQuery/Snowflake with Google Cloud Datastore/Redis -* aws: use Redshift/Snowflake with DynamoDB/Redis +* local: use a SQL registry or local file registry. By default, use a file / Dask based offline store + SQLite online store +* gcp: use a SQL registry or GCS file registry. By default, use BigQuery (offline store) + Google Cloud Datastore (online store) +* aws: use a SQL registry or S3 file registry. By default, use Redshift (offline store) + DynamoDB (online store) -Note that there are many other sources Feast works with, including Azure, Hive, Trino, and PostgreSQL via community plugins. See [Third party integrations](../getting-started/third-party-integrations.md) for all supported datasources. +Note that there are many other offline / online stores Feast works with, including Spark, Azure, Hive, Trino, and +PostgreSQL via community plugins. See [Third party integrations](third-party-integrations.md) for all supported data sources. -A custom setup can also be made by following [adding a custom provider](../how-to-guides/creating-a-custom-provider.md). +A custom setup can also be made by following [Customizing Feast](../how-to-guides/customizing-feast/). ### Inspecting the raw data @@ -148,9 +224,22 @@ pd.read_parquet("data/driver_stats.parquet") ![Demo parquet data: data/driver\_stats.parquet](../.gitbook/assets/screen-shot-2021-08-23-at-2.35.18-pm.png) -## Step 3: Register feature definitions and deploy your feature store +## Step 3: Run sample workflow +There's an included `test_workflow.py` file which runs through a full sample workflow: +1. Register feature definitions through `feast apply` +2. Generate a training dataset (using `get_historical_features`) +3. Generate features for batch scoring (using `get_historical_features`) +4. 
Ingest batch features into an online store (using `materialize_incremental`) +5. Fetch online features to power real time inference (using `get_online_features`) +6. Ingest streaming features into offline / online stores (using `push`) +7. Verify online features are updated / fresher -The `apply` command scans python files in the current directory for feature view/entity definitions, registers the objects, and deploys infrastructure. In this example, it reads `example.py` (shown again below for convenience) and sets up SQLite online store tables. Note that we had specified SQLite as the default online store by using the `local` provider in `feature_store.yaml`. +We'll walk through some snippets of code below and explain +### Step 3a: Register feature definitions and deploy your feature store + +The `apply` command scans python files in the current directory for feature view/entity definitions, registers the +objects, and deploys infrastructure. In this example, it reads `example_repo.py` and sets up SQLite online store tables. Note that we had specified SQLite as the default online store by +configuring `online_store` in `feature_store.yaml`. {% tabs %} {% tab title="Bash" %} @@ -158,97 +247,61 @@ The `apply` command scans python files in the current directory for feature view feast apply ``` {% endtab %} - -{% tab title="example.py" %} -```python -# This is an example feature definition file - -from datetime import timedelta - -from feast import Entity, FeatureView, Field, FileSource, ValueType -from feast.types import Float32, Int64 - -# Read data from parquet files. Parquet is convenient for local development mode. For -# production, you can use your favorite DWH, such as BigQuery. See Feast documentation -# for more info. -driver_hourly_stats = FileSource( - path="/content/feature_repo/data/driver_stats.parquet", - timestamp_field="event_timestamp", - created_timestamp_column="created", -) - -# Define an entity for the driver. 
You can think of entity as a primary key used to -# fetch features. -# Entity has a name used for later reference (in a feature view, eg) -# and join_key to identify physical field name used in storages -driver = Entity(name="driver", value_type=ValueType.INT64, join_keys=["driver_id"], description="driver id",) - -# Our parquet files contain sample data that includes a driver_id column, timestamps and -# three feature column. Here we define a Feature View that will allow us to serve this -# data to our model online. -driver_hourly_stats_view = FeatureView( - name="driver_hourly_stats", - entities=["driver"], # reference entity by name - ttl=timedelta(seconds=86400 * 1), - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - online=True, - source=driver_hourly_stats, - tags={}, -) - -driver_stats_fs = FeatureService( - name="driver_activity", - features=[driver_hourly_stats_view] -) -``` -{% endtab %} {% endtabs %} {% tabs %} {% tab title="Output" %} ``` -Registered entity driver_id -Registered feature view driver_hourly_stats -Deploying infrastructure for driver_hourly_stats +Created entity driver +Created feature view driver_hourly_stats +Created on demand feature view transformed_conv_rate +Created feature service driver_activity_v1 +Created feature service driver_activity_v2 + +Created sqlite table my_project_driver_hourly_stats ``` {% endtab %} {% endtabs %} -## Step 4: Generating training data +### Step 3b: Generating training data or powering batch scoring models -To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). +To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). 
Feast can help generate the features that map to these labels. -The user can query that table of labels with timestamps and pass that into Feast as an _entity dataframe_ for training data generation. In many cases, Feast will also intelligently join relevant tables to create the relevant feature vectors. +Feast needs a list of **entities** (e.g. driver ids) and **timestamps**. Feast will intelligently join relevant +tables to create the relevant feature vectors. There are two ways to generate this list: +1. The user can query that table of labels with timestamps and pass that into Feast as an _entity dataframe_ for +training data generation. +2. The user can also query that table with a *SQL query* which pulls entities. See the documentation on [feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) for details * Note that we include timestamps because we want the features for the same driver at various timestamps to be used in a model. +#### Generating training data + {% tabs %} {% tab title="Python" %} ```python -from datetime import datetime, timedelta +from datetime import datetime import pandas as pd from feast import FeatureStore -# The entity dataframe is the dataframe we want to enrich with feature values +# Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for +# more details on how to retrieve for all entities in the offline store instead entity_df = pd.DataFrame.from_dict( { # entity's join key -> entity values "driver_id": [1001, 1002, 1003], - - # label name -> label values - "label_driver_reported_satisfaction": [1, 5, 3], - # "event_timestamp" (reserved key) -> timestamps "event_timestamp": [ - datetime.now() - timedelta(minutes=11), - datetime.now() - timedelta(minutes=36), - datetime.now() - timedelta(minutes=73), + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), ], + # (optional) label name -> label values. 
Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], } ) @@ -260,6 +313,8 @@ training_df = store.get_historical_features( "driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate", "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", ], ).to_df() @@ -296,18 +351,55 @@ None ----- Example features ----- event_timestamp driver_id ... acc_rate avg_daily_trips -0 2021-08-23 15:12:55.489091+00:00 1003 ... 0.120588 938 -1 2021-08-23 15:49:55.489089+00:00 1002 ... 0.504881 635 -2 2021-08-23 16:14:55.489075+00:00 1001 ... 0.138416 606 +0 2021-08-23 15:12:55.489091+00:00 1003 ... 0.077863 741 +1 2021-08-23 15:49:55.489089+00:00 1002 ... 0.074327 113 +2 2021-08-23 16:14:55.489075+00:00 1001 ... 0.105046 347 [3 rows x 6 columns] ``` {% endtab %} {% endtabs %} -## Step 5: Load features into your online store +#### Run offline inference (batch scoring) +To power a batch model, we primarily need to generate features with the `get_historical_features` call, but using the current timestamp -We now serialize the latest values of features since the beginning of time to prepare for serving (note: `materialize-incremental` serializes all new features since the last `materialize` call). 
+{% tabs %} +{% tab title="Python" %} +```python +entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) +training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], +).to_df() + +print("\n----- Example features -----\n") +print(training_df.head()) +``` +{% endtab %} +{% endtabs %} + +{% tabs %} +{% tab title="Output" %} +``` +----- Example features ----- + + driver_id event_timestamp ... acc_rate avg_daily_trips conv_rate_plus_val1 +0 1001 2022-08-08 18:22:06.555018+00:00 ... 0.864639 359 1.663844 +1 1002 2022-08-08 18:22:06.555018+00:00 ... 0.695982 311 2.151189 +2 1003 2022-08-08 18:22:06.555018+00:00 ... 0.949191 789 3.769165 +``` +{% endtab %} +{% endtabs %} +### Step 3c: Ingest batch features into your online store + +We now serialize the latest values of features since the beginning of time to prepare for serving (note: +`materialize-incremental` serializes all new features since the last `materialize` call). {% tabs %} {% tab title="Bash" %} @@ -330,9 +422,11 @@ driver_hourly_stats from 2021-08-22 16:25:47+00:00 to 2021-08-23 16:25:46+00:00: {% endtab %} {% endtabs %} -## Step 6: Fetching feature vectors for inference +### Step 3d: Fetching feature vectors for inference -At inference time, we need to quickly read the latest feature values for different drivers (which otherwise might have existed only in batch sources) from the online feature store using `get_online_features()`. These feature vectors can then be fed to the model. +At inference time, we need to quickly read the latest feature values for different drivers (which otherwise might +have existed only in batch sources) from the online feature store using `get_online_features()`. These feature +vectors can then be fed to the model. 
{% tabs %} {% tab title="Python" %} @@ -373,25 +467,30 @@ pprint(feature_vector) {% endtab %} {% endtabs %} -## Step 7: Using a feature service to fetch online features instead. +### Step 3e: Using a feature service to fetch online features instead. -You can also use feature services to manage multiple features, and decouple feature view definitions and the features needed by end applications. The feature store can also be used to fetch either online or historical features using the same api below. More information can be found [here](https://docs.feast.dev/getting-started/concepts/feature-retrieval). +You can also use feature services to manage multiple features, and decouple feature view definitions and the +features needed by end applications. The feature store can also be used to fetch either online or historical +features using the same API below. More information can be found +[here](https://docs.feast.dev/getting-started/concepts/feature-retrieval). -The `driver_activity` feature service pulls all features from the `driver_hourly_stats` feature view: +The `driver_activity_v1` feature service pulls all features from the `driver_hourly_stats` feature view: ```python +from feast import FeatureService driver_stats_fs = FeatureService( - name="driver_activity", features=[driver_hourly_stats_view] + name="driver_activity_v1", features=[driver_hourly_stats_view] ) ``` {% tabs %} {% tab title="Python" %} ```python +from pprint import pprint from feast import FeatureStore feature_store = FeatureStore('.') # Initialize the feature store -feature_service = feature_store.get_feature_service("driver_activity") +feature_service = feature_store.get_feature_service("driver_activity_v1") feature_vector = feature_store.get_online_features( features=feature_service, entity_rows=[ @@ -402,6 +501,8 @@ feature_vector = feature_store.get_online_features( ).to_dict() pprint(feature_vector) ``` +{% endtab %} +{% endtabs %} {% tabs %} {% tab title="Output" %} @@ -416,18 +517,46 @@ 
pprint(feature_vector) {% endtab %} {% endtabs %} -## Step 8: Browse your features with the Web UI (experimental) +## Step 4: Browse your features with the Web UI (experimental) View all registered features, data sources, entities, and feature services with the Web UI. One of the ways to view this is with the `feast ui` command. +{% tabs %} +{% tab title="Bash" %} +```bash +feast ui +``` +{% endtab %} +{% endtabs %} + +{% tabs %} +{% tab title="Output" %} +```bash +INFO: Started server process [66664] +08/17/2022 01:25:49 PM uvicorn.error INFO: Started server process [66664] +INFO: Waiting for application startup. +08/17/2022 01:25:49 PM uvicorn.error INFO: Waiting for application startup. +INFO: Application startup complete. +08/17/2022 01:25:49 PM uvicorn.error INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:8888 (Press CTRL+C to quit) +08/17/2022 01:25:49 PM uvicorn.error INFO: Uvicorn running on http://0.0.0.0:8888 (Press CTRL+C to quit) +``` +{% endtab %} +{% endtabs %} + + ![](../reference/ui.png) +## Step 5: Re-examine `test_workflow.py` +Take a look at `test_workflow.py` again. It showcases many sample flows on how to interact with Feast. You'll see these +show up in the upcoming concepts + architecture + tutorial pages as well. + ## Next steps * Read the [Concepts](concepts/) page to understand the Feast data model. * Read the [Architecture](architecture-and-components/) page. -* Check out our [Tutorials](../tutorials/tutorials-overview.md) section for more examples on how to use Feast. +* Check out our [Tutorials](../tutorials/tutorials-overview/) section for more examples on how to use Feast. * Follow our [Running Feast with Snowflake/GCP/AWS](../how-to-guides/feast-snowflake-gcp-aws/) guide for a more in-depth tutorial on using Feast. * Join other Feast users and contributors in [Slack](https://slack.feast.dev) and become part of the community! 
diff --git a/docs/getting-started/third-party-integrations.md b/docs/getting-started/third-party-integrations.md index ef47a11029..8e6a600aa0 100644 --- a/docs/getting-started/third-party-integrations.md +++ b/docs/getting-started/third-party-integrations.md @@ -5,13 +5,13 @@ We integrate with a wide set of tools and technologies so you can make Feast wor {% hint style="info" %} Don't see your offline store or online store of choice here? Check out our guides to make a custom one! -* [Adding a new offline store](../how-to-guides/adding-a-new-offline-store.md) -* [Adding a new online store](../how-to-guides/adding-support-for-a-new-online-store.md) +* [Adding a new offline store](../how-to-guides/customizing-feast/adding-a-new-offline-store.md) +* [Adding a new online store](../how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md) {% endhint %} ## Integrations -See [Functionality and Roadmap](../../README.md#-functionality-and-roadmap) +See [Functionality and Roadmap](../../#-functionality-and-roadmap) ## Standards @@ -19,7 +19,7 @@ In order for a plugin integration to be highlighted, it must meet the following 1. The plugin must have tests. Ideally it would use the Feast universal tests (see this [guide](../how-to-guides/adding-or-reusing-tests.md) for an example), but custom tests are fine. 2. The plugin must have some basic documentation on how it should be used. -3. The author must work with a maintainer to pass a basic code review (e.g. to ensure that the implementation roughly matches the core Feast implementations). +3. The author must work with a maintainer to pass a basic code review (e.g. to ensure that the implementation roughly matches the core Feast implementations). 
In order for a plugin integration to be merged into the main Feast repo, it must meet the following requirements: diff --git a/docs/how-to-guides/adding-or-reusing-tests.md b/docs/how-to-guides/adding-or-reusing-tests.md index 86c116442f..d68e47df5c 100644 --- a/docs/how-to-guides/adding-or-reusing-tests.md +++ b/docs/how-to-guides/adding-or-reusing-tests.md @@ -6,111 +6,208 @@ This guide will go over: 1. how Feast tests are setup 2. how to extend the test suite to test new functionality -3. how to use the existing test suite to test a new custom offline / online store. +3. how to use the existing test suite to test a new custom offline / online store ## Test suite overview -Let's inspect the test setup in `sdk/python/tests/integration`: +Unit tests are contained in `sdk/python/tests/unit`. +Integration tests are contained in `sdk/python/tests/integration`. +Let's inspect the structure of `sdk/python/tests/integration`: ```bash $ tree - . ├── e2e -│ └── test_universal_e2e.py +│ ├── test_go_feature_server.py +│ ├── test_python_feature_server.py +│ ├── test_universal_e2e.py +│ ├── test_usage_e2e.py +│ └── test_validation.py ├── feature_repos +│ ├── integration_test_repo_config.py │ ├── repo_configuration.py │ └── universal +│ ├── catalog │ ├── data_source_creator.py │ ├── data_sources +│ │ ├── __init__.py │ │ ├── bigquery.py │ │ ├── file.py -│ │ └── redshift.py +│ │ ├── redshift.py +│ │ └── snowflake.py │ ├── entities.py -│ └── feature_views.py +│ ├── feature_views.py +│ ├── online_store +│ │ ├── __init__.py +│ │ ├── datastore.py +│ │ ├── dynamodb.py +│ │ ├── hbase.py +│ │ └── redis.py +│ └── online_store_creator.py +├── materialization +│ └── test_lambda.py ├── offline_store +│ ├── test_feature_logging.py +│ ├── test_offline_write.py +│ ├── test_push_features_to_offline_store.py │ ├── test_s3_custom_endpoint.py │ └── test_universal_historical_retrieval.py ├── online_store -│ ├── test_e2e_local.py -│ ├── test_feature_service_read.py -│ ├── test_online_retrieval.py +│ 
├── test_push_features_to_online_store.py │ └── test_universal_online.py -├── registration -│ ├── test_cli.py -│ ├── test_cli_apply_duplicated_featureview_names.py -│ ├── test_cli_chdir.py -│ ├── test_feature_service_apply.py -│ ├── test_feature_store.py -│ ├── test_inference.py -│ ├── test_registry.py -│ ├── test_universal_odfv_feature_inference.py -│ └── test_universal_types.py -└── scaffolding - ├── test_init.py - ├── test_partial_apply.py - ├── test_repo_config.py - └── test_repo_operations.py - -8 directories, 27 files -``` +└── registration + ├── test_feature_store.py + ├── test_inference.py + ├── test_registry.py + ├── test_universal_cli.py + ├── test_universal_odfv_feature_inference.py + └── test_universal_types.py -`feature_repos` has setup files for most tests in the test suite and pytest fixtures for other tests. These fixtures parametrize on different offline stores, online stores, etc. and thus abstract away store specific implementations so tests don't need to rewrite e.g. uploading dataframes to a specific store for setup. +``` -## Understanding an example test +* `feature_repos` has setup files for most tests in the test suite. +* `conftest.py` (in the parent directory) contains the most common [fixtures](https://docs.pytest.org/en/6.2.x/fixture.html), which are designed as an abstraction on top of specific offline/online stores, so tests do not need to be rewritten for different stores. Individual test files also contain more specific fixtures. +* The tests are organized by which Feast component(s) they test. + +## Structure of the test suite + +### Universal feature repo + +The universal feature repo refers to a set of fixtures (e.g. `environment` and `universal_data_sources`) that can be parametrized to cover various combinations of offline stores, online stores, and providers. +This allows tests to run against all these various combinations without requiring excess code. 
+The universal feature repo is constructed by fixtures in `conftest.py` with help from the various files in `feature_repos`. + +### Integration vs. unit tests + +Tests in Feast are split into integration and unit tests. +If a test requires external resources (e.g. cloud resources on GCP or AWS), it is an integration test. +If a test can be run purely locally (where locally includes Docker resources), it is a unit test. + +* Integration tests test non-local Feast behavior. For example, tests that require reading data from BigQuery or materializing data to DynamoDB are integration tests. Integration tests also tend to involve more complex Feast functionality. +* Unit tests test local Feast behavior. For example, tests that only require registering feature views are unit tests. Unit tests tend to only involve simple Feast functionality. + +### Main types of tests + +#### Integration tests + +1. E2E tests + * E2E tests test end-to-end functionality of Feast over the various codepaths (initialize a feature store, apply, and materialize). + * The main codepaths include: + * basic e2e tests for offline stores + * `test_universal_e2e.py` + * go feature server + * `test_go_feature_server.py` + * python http server + * `test_python_feature_server.py` + * usage tracking + * `test_usage_e2e.py` + * data quality monitoring feature validation + * `test_validation.py` +2. Offline and Online Store Tests + * Offline and online store tests mainly test for the offline and online retrieval functionality. + * The various specific functionalities that are tested include: + * push API tests + * `test_push_features_to_offline_store.py` + * `test_push_features_to_online_store.py` + * `test_offline_write.py` + * historical retrieval tests + * `test_universal_historical_retrieval.py` + * online retrieval tests + * `test_universal_online.py` + * data quality monitoring feature logging tests + * `test_feature_logging.py` + * online store tests + * `test_universal_online.py` +3. 
Registration Tests + * The registration folder contains all of the registry tests and some universal cli tests. This includes: + * CLI Apply and Materialize tests tested against on the universal test suite + * Data type inference tests + * Registry tests +4. Miscellaneous Tests + * AWS Lambda Materialization Tests (Currently do not work) + * `test_lambda.py` + +#### Unit tests + +1. Registry Diff Tests + * These are tests for the infrastructure and registry diff functionality that Feast uses to determine if changes to the registry or infrastructure is needed. +2. Local CLI Tests and Local Feast Tests + * These tests test all of the cli commands against the local file offline store. +3. Infrastructure Unit Tests + * DynamoDB tests with dynamo mocked out + * Repository configuration tests + * Schema inference unit tests + * Key serialization tests + * Basic provider unit tests +4. Feature Store Validation Tests + * These test mainly contain class level validation like hashing tests, protobuf and class serialization, and error and warning handling. + * Data source unit tests + * Feature service unit tests + * Feature service, feature view, and feature validation tests + * Protobuf/json tests for Feast ValueTypes + * Serialization tests + * Type mapping + * Feast types + * Serialization tests due to this [issue](https://github.com/feast-dev/feast/issues/2345) + * Feast usage tracking unit tests + +#### Docstring tests + +Docstring tests are primarily smoke tests to make sure imports and setup functions can be executed without errors. 
+ +## Understanding the test suite with an example test + +### Example test Let's look at a sample test using the universal repo: {% tabs %} -{% tab title="Python" %} +{% tab code="sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py" %} ```python @pytest.mark.integration -@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +@pytest.mark.universal_offline_stores +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: f"full:{v}") def test_historical_features(environment, universal_data_sources, full_feature_names): store = environment.feature_store (entities, datasets, data_sources) = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - customer_df, driver_df, orders_df, global_df, entity_df = ( - datasets["customer"], - datasets["driver"], - datasets["orders"], - datasets["global"], - datasets["entity"], - ) - # ... more test code + feature_views = construct_universal_feature_views(data_sources) - customer_fv, driver_fv, driver_odfv, order_fv, global_fv = ( - feature_views["customer"], - feature_views["driver"], - feature_views["driver_odfv"], - feature_views["order"], - feature_views["global"], - ) + entity_df_with_request_data = datasets.entity_df.copy(deep=True) + entity_df_with_request_data["val_to_add"] = [ + i for i in range(len(entity_df_with_request_data)) + ] + entity_df_with_request_data["driver_age"] = [ + i + 100 for i in range(len(entity_df_with_request_data)) + ] feature_service = FeatureService( - "convrate_plus100", + name="convrate_plus100", + features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv], + ) + feature_service_entity_mapping = FeatureService( + name="entity_mapping", features=[ - feature_views["driver"][["conv_rate"]], - feature_views["driver_odfv"] + feature_views.location.with_name("origin").with_join_key_map( + {"location_id": "origin_id"} + ), + 
feature_views.location.with_name("destination").with_join_key_map( + {"location_id": "destination_id"} + ), ], ) - feast_objects = [] - feast_objects.extend( + store.apply( [ - customer_fv, - driver_fv, - driver_odfv, - order_fv, - global_fv, driver(), customer(), + location(), feature_service, + feature_service_entity_mapping, + *feature_views.values(), ] ) - store.apply(feast_objects) # ... more test code job_from_df = store.get_historical_features( @@ -122,48 +219,86 @@ def test_historical_features(environment, universal_data_sources, full_feature_n "customer_profile:avg_passenger_count", "customer_profile:lifetime_trip_count", "conv_rate_plus_100:conv_rate_plus_100", + "conv_rate_plus_100:conv_rate_plus_100_rounded", "conv_rate_plus_100:conv_rate_plus_val_to_add", "order:order_is_success", "global_stats:num_rides", "global_stats:avg_ride_length", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) + + if job_from_df.supports_remote_storage_export(): + files = job_from_df.to_remote_storage() + print(files) + assert len(files) > 0 # This test should be way more detailed + + start_time = datetime.utcnow() actual_df_from_df_entities = job_from_df.to_df() # ... more test code - assert_frame_equal( - expected_df, actual_df_from_df_entities, check_dtype=False, + validate_dataframes( + expected_df, + table_from_df_entities, + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp = event_timestamp, ) # ... more test code ``` {% endtab %} {% endtabs %} -The key fixtures are the `environment` and `universal_data_sources` fixtures, which are defined in the `feature_repos` directories. This by default pulls in a standard dataset with driver and customer entities, certain feature views, and feature values. By including the environment as a parameter, the test automatically parametrizes across other offline / online store combinations. 
+* The key fixtures are the `environment` and `universal_data_sources` fixtures, which are defined in the `feature_repos` directories and the `conftest.py` file. This by default pulls in a standard dataset with driver and customer entities (that we have pre-defined), certain feature views, and feature values. + * The `environment` fixture sets up a feature store, parametrized by the provider and the online/offline store. It allows the test to query against that feature store without needing to worry about the underlying implementation or any setup that may be involved in creating instances of these datastores. + * Each fixture creates a different integration test with its own `IntegrationTestRepoConfig` which is used by pytest to generate a unique test testing one of the different environments that require testing. + +* Feast tests also use a variety of markers: + * The `@pytest.mark.integration` marker is used to designate integration tests which will cause the test to be run when you call `make test-python-integration`. + * The `@pytest.mark.universal_offline_stores` marker will parametrize the test on all of the universal offline stores including file, redshift, bigquery and snowflake. + * The `full_feature_names` parametrization defines whether or not the test should reference features as their full feature name (fully qualified path) or just the feature name itself. + ## Writing a new test or reusing existing tests ### To add a new test to an existing test file -* Use the same function signatures as an existing test (e.g. use `environment` as an argument) to include the relevant test fixtures. -* If possible, expand an individual test instead of writing a new test, due to the cost of standing up offline / online stores. +* Use the same function signatures as an existing test (e.g. use `environment` and `universal_data_sources` as an argument) to include the relevant test fixtures. 
+* If possible, expand an individual test instead of writing a new test, due to the cost of starting up offline / online stores. +* Use the `universal_offline_stores` and `universal_online_store` markers to parametrize the test against different offline store and online store combinations. You can also designate specific online and offline stores to test by using the `only` parameter on the marker. +```python +@pytest.mark.universal_online_stores(only=["redis"]) +``` ### To test a new offline / online store from a plugin repo * Install Feast in editable mode with `pip install -e`. * The core tests for offline / online store behavior are parametrized by the `FULL_REPO_CONFIGS` variable defined in `feature_repos/repo_configuration.py`. To overwrite this variable without modifying the Feast repo, create your own file that contains a `FULL_REPO_CONFIGS` (which will require adding a new `IntegrationTestRepoConfig` or two) and set the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. Then the core offline / online store tests can be run with `make test-python-universal`. * See the [custom offline store demo](https://github.com/feast-dev/feast-custom-offline-store-demo) and the [custom online store demo](https://github.com/feast-dev/feast-custom-online-store-demo) for examples. +### What are some important things to keep in mind when adding a new offline / online store? + +#### Type mapping/Inference + +Many problems arise when implementing your data store's type conversion to interface with Feast datatypes. +1. You will need to correctly update `inference.py` so that Feast can infer your datasource schemas +2. You also need to update `type_map.py` so that Feast knows how to convert your datastores types to Feast-recognized types in `feast/types.py`. + +#### Historical and online retrieval + +The most important functionality in Feast is historical and online retrieval. Most of the e2e and universal integration test test this functionality in some way. 
Making sure this functionality works also indirectly asserts that reading and writing from your datastore works as intended. + + ### To include a new offline / online store in the main Feast repo * Extend `data_source_creator.py` for your offline store. -* In `repo_configuration.py` add a new`IntegrationTestRepoConfig` or two (depending on how many online stores you want to test). +* In `repo_configuration.py` add a new `IntegrationTestRepoConfig` or two (depending on how many online stores you want to test). + * Generally, you should only need to test against sqlite. However, if you need to test against a production online store, then you can also test against Redis or dynamodb. * Run the full test suite with `make test-python-integration.` ### Including a new offline / online store in the main Feast repo from external plugins with community maintainers. -* This folder is for plugins that are officially maintained with community owners. Place the APIs in feast/infra/offline_stores/contrib/. +* This folder is for plugins that are officially maintained with community owners. Place the APIs in `feast/infra/offline_stores/contrib/`. * Extend `data_source_creator.py` for your offline store and implement the required APIs. * In `contrib_repo_configuration.py` add a new `IntegrationTestRepoConfig` (depending on how many online stores you want to test). * Run the test suite on the contrib test suite with `make test-python-contrib-universal`. @@ -171,7 +306,7 @@ The key fixtures are the `environment` and `universal_data_sources` fixtures, wh ### To include a new online store * In `repo_configuration.py` add a new config that maps to a serialized version of configuration you need in `feature_store.yaml` to setup the online store. -* In `repo_configuration.py`, add new`IntegrationTestRepoConfig` for offline stores you want to test. +* In `repo_configuration.py`, add new `IntegrationTestRepoConfig` for online stores you want to test. 
* Run the full test suite with `make test-python-integration` ### To use custom data in a new test @@ -193,11 +328,11 @@ def your_test(environment: Environment): # ... run test ``` -### Running your own redis cluster for testing +### Running your own Redis cluster for testing -* Install redis on your computer. If you are a mac user, you should be able to `brew install redis`. +* Install Redis on your computer. If you are a mac user, you should be able to `brew install redis`. * Running `redis-server --help` and `redis-cli --help` should show corresponding help menus. -* Run `cd scripts/create-cluster` and run `./create-cluster start` then `./create-cluster create` to start the server. You should see output that looks like this: +* Run `./infra/scripts/redis-cluster.sh start` then `./infra/scripts/redis-cluster.sh create` to start the Redis cluster locally. You should see output that looks like this: ~~~~ Starting 6001 Starting 6002 @@ -206,6 +341,6 @@ Starting 6004 Starting 6005 Starting 6006 ~~~~ -* You should be able to run the integration tests and have the redis cluster tests pass. -* If you would like to run your own redis cluster, you can run the above commands with your own specified ports and connect to the newly configured cluster. -* To stop the cluster, run `./create-cluster stop` and then `./create-cluster clean`. +* You should be able to run the integration tests and have the Redis cluster tests pass. +* If you would like to run your own Redis cluster, you can run the above commands with your own specified ports and connect to the newly configured cluster. +* To stop the cluster, run `./infra/scripts/redis-cluster.sh stop` and then `./infra/scripts/redis-cluster.sh clean`. 
diff --git a/docs/how-to-guides/automated-feast-upgrade.md b/docs/how-to-guides/automated-feast-upgrade.md index ff17748537..89277fb615 100644 --- a/docs/how-to-guides/automated-feast-upgrade.md +++ b/docs/how-to-guides/automated-feast-upgrade.md @@ -56,7 +56,7 @@ $ feast repo-upgrade --write --- /Users/achal/feast/prompt_dory/example.py +++ /Users/achal/feast/prompt_dory/example.py @@ -28,9 +29,9 @@ - entities=["driver_id"], + entities=[driver_id], ttl=Duration(seconds=86400 * 365), features=[ - Feature(name="conv_rate", dtype=ValueType.FLOAT), diff --git a/docs/how-to-guides/customizing-feast/README.md b/docs/how-to-guides/customizing-feast/README.md new file mode 100644 index 0000000000..91c04e2f35 --- /dev/null +++ b/docs/how-to-guides/customizing-feast/README.md @@ -0,0 +1,24 @@ +# Customizing Feast + +Feast is highly pluggable and configurable: + +* One can use existing plugins (offline store, online store, batch materialization engine, providers) and configure those using the built in options. See reference documentation for details. +* The other way to customize Feast is to build your own custom components, and then point Feast to delegate to them. 
+ +Below are some guides on how to add new custom components: + +{% content-ref url="adding-a-new-offline-store.md" %} +[adding-a-new-offline-store.md](adding-a-new-offline-store.md) +{% endcontent-ref %} + +{% content-ref url="adding-support-for-a-new-online-store.md" %} +[adding-support-for-a-new-online-store.md](adding-support-for-a-new-online-store.md) +{% endcontent-ref %} + +{% content-ref url="creating-a-custom-materialization-engine.md" %} +[creating-a-custom-materialization-engine.md](creating-a-custom-materialization-engine.md) +{% endcontent-ref %} + +{% content-ref url="creating-a-custom-provider.md" %} +[creating-a-custom-provider.md](creating-a-custom-provider.md) +{% endcontent-ref %} diff --git a/docs/how-to-guides/adding-a-new-offline-store.md b/docs/how-to-guides/customizing-feast/adding-a-new-offline-store.md similarity index 80% rename from docs/how-to-guides/adding-a-new-offline-store.md rename to docs/how-to-guides/customizing-feast/adding-a-new-offline-store.md index c548538fce..b2818b748f 100644 --- a/docs/how-to-guides/adding-a-new-offline-store.md +++ b/docs/how-to-guides/customizing-feast/adding-a-new-offline-store.md @@ -2,7 +2,7 @@ ## Overview -Feast makes adding support for a new offline store easy. Developers can simply implement the [OfflineStore](../../sdk/python/feast/infra/offline\_stores/offline\_store.py#L41) interface to add support for a new store (other than the existing stores like Parquet files, Redshift, and Bigquery). +Feast makes adding support for a new offline store easy. Developers can simply implement the [OfflineStore](../../../sdk/python/feast/infra/offline\_stores/offline\_store.py#L41) interface to add support for a new store (other than the existing stores like Parquet files, Redshift, and Bigquery). In this guide, we will show you how to extend the existing File offline store and use in a feature repo. 
While we will be implementing a specific store, this guide should be representative for adding support for any new offline store. @@ -22,7 +22,7 @@ The process for using a custom offline store consists of 8 steps: ## 1. Defining an OfflineStore class {% hint style="info" %} - OfflineStore class names must end with the OfflineStore suffix! +OfflineStore class names must end with the OfflineStore suffix! {% endhint %} ### Contrib offline stores @@ -31,23 +31,26 @@ New offline stores go in `sdk/python/feast/infra/offline_stores/contrib/`. #### What is a contrib plugin? -- Not guaranteed to implement all interface methods -- Not guaranteed to be stable. -- Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. +* Not guaranteed to implement all interface methods +* Not guaranteed to be stable. +* Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. #### How do I make a contrib plugin an "official" plugin? + To move an offline store plugin out of contrib, you need: -- GitHub actions (i.e `make test-python-integration`) is setup to run all tests against the offline store and pass. -- At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). + +* GitHub actions (i.e `make test-python-integration`) is setup to run all tests against the offline store and pass. +* At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). #### Define the offline store class -The OfflineStore class contains a couple of methods to read features from the offline store. Unlike the OnlineStore class, Feast does not manage any infrastructure for the offline store. + +The OfflineStore class contains a couple of methods to read features from the offline store. Unlike the OnlineStore class, Feast does not manage any infrastructure for the offline store. 
To fully implement the interface for the offline store, you will need to implement these methods: * `pull_latest_from_table_or_query` is invoked when running materialization (using the `feast materialize` or `feast materialize-incremental` commands, or the corresponding `FeatureStore.materialize()` method. This method pull data from the offline store, and the `FeatureStore` class takes care of writing this data into the online store. * `get_historical_features` is invoked when reading values from the offline store using the `FeatureStore.get_historical_features()` method. Typically, this method is used to retrieve features when training ML models. -* (optional) `offline_write_batch` is a method that supports directly pushing a pyarrow table to a feature view. Given a feature view with a specific schema, this function should write the pyarrow table to the batch source defined. More details about the push api can be found [here](docs/reference/data-sources/push.md). This method only needs implementation if you want to support the push api in your offline store. +* (optional) `offline_write_batch` is a method that supports directly pushing a pyarrow table to a feature view. Given a feature view with a specific schema, this function should write the pyarrow table to the batch source defined. More details about the push api can be found [here](../docs/reference/data-sources/push.md). This method only needs implementation if you want to support the push api in your offline store. * (optional) `pull_all_from_table_or_query` is a method that pulls all the data from an offline store from a specified start date to a specified end date. This method is only used for **SavedDatasets** as part of data quality monitoring validation. * (optional) `write_logged_features` is a method that takes a pyarrow table or a path that points to a parquet file and writes the data to a defined source defined by `LoggingSource` and `LoggingConfig`. 
This method is only used internally for **SavedDatasets**. @@ -140,29 +143,30 @@ To fully implement the interface for the offline store, you will need to impleme ) # Implementation here. pass - ``` {% endcode %} ### 1.1 Type Mapping Most offline stores will have to perform some custom mapping of offline store datatypes to feast value types. -- The function to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. + +* The function to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. * `source_datatype_to_feast_value_type` is used to convert your DataSource's datatypes to feast value types. * `get_column_names_and_types` retrieves the column names and corresponding datasource types. Add any helper functions for type conversion to `sdk/python/feast/type_map.py`. -- Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data. + +* Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data. ## 2. Defining an OfflineStoreConfig class Additional configuration may be needed to allow the OfflineStore to talk to the backing store. For example, Redshift needs configuration information like the connection information for the Redshift instance, credentials for connecting to the database, etc. -To facilitate configuration, all OfflineStore implementations are **required** to also define a corresponding OfflineStoreConfig class in the same file. This OfflineStoreConfig class should inherit from the `FeastConfigBaseModel` class, which is defined [here](../../sdk/python/feast/repo\_config.py#L44). 
+To facilitate configuration, all OfflineStore implementations are **required** to also define a corresponding OfflineStoreConfig class in the same file. This OfflineStoreConfig class should inherit from the `FeastConfigBaseModel` class, which is defined [here](../../../sdk/python/feast/repo\_config.py#L44). The `FeastConfigBaseModel` is a [pydantic](https://pydantic-docs.helpmanual.io) class, which parses yaml configuration into python objects. Pydantic also allows the model classes to define validators for the config classes, to make sure that the config classes are correctly defined. -This config class **must** container a `type` field, which contains the fully qualified class name of its corresponding OfflineStore class. +This config class **must** contain a `type` field, which contains the fully qualified class name of its corresponding OfflineStore class. Additionally, the name of the config class must be the same as the OfflineStore class, with the `Config` suffix. @@ -195,7 +199,7 @@ online_store: ``` {% endcode %} -This configuration information is available to the methods of the OfflineStore, via the `config: RepoConfig` parameter which is passed into the methods of the OfflineStore interface, specifically at the `config.offline_store` field of the `config` parameter. This fields in the `feature_store.yaml` should map directly to your `OfflineStoreConfig` class that is detailed above in Section 2. +This configuration information is available to the methods of the OfflineStore, via the `config: RepoConfig` parameter which is passed into the methods of the OfflineStore interface, specifically at the `config.offline_store` field of the `config` parameter. These fields in the `feature_store.yaml` should map directly to your `OfflineStoreConfig` class that is detailed above in Section 2. 
{% code title="feast_custom_offline_store/file.py" %} ```python @@ -225,7 +229,7 @@ Custom offline stores may need to implement their own instances of the `Retrieva The `RetrievalJob` interface exposes two methods - `to_df` and `to_arrow`. The expectation is for the retrieval job to be able to return the rows read from the offline store as a parquet DataFrame, or as an Arrow table respectively. -Users who want to have their offline store support **scalable batch materialization** for online use cases (detailed in this [RFC](https://docs.google.com/document/d/1J7XdwwgQ9dY_uoV9zkRVGQjK9Sy43WISEW6D5V9qzGo/edit#heading=h.9gaqqtox9jg6)) will also need to implement `to_remote_storage` to distribute the reading and writing of offline store records to blob storage (such as S3). This may be used by a custom [Materialization Engine](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/materialization/batch_materialization_engine.py#L72) to parallelize the materialization of data by processing it in chunks. If this is not implemented, Feast will default to local materialization (pulling all records into memory to materialize). +Users who want to have their offline store support **scalable batch materialization** for online use cases (detailed in this [RFC](https://docs.google.com/document/d/1J7XdwwgQ9dY\_uoV9zkRVGQjK9Sy43WISEW6D5V9qzGo/edit#heading=h.9gaqqtox9jg6)) will also need to implement `to_remote_storage` to distribute the reading and writing of offline store records to blob storage (such as S3). This may be used by a custom [Materialization Engine](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/materialization/batch\_materialization\_engine.py#L72) to parallelize the materialization of data by processing it in chunks. If this is not implemented, Feast will default to local materialization (pulling all records into memory to materialize). 
{% code title="feast_custom_offline_store/file.py" %} ```python @@ -258,7 +262,7 @@ class CustomFileRetrievalJob(RetrievalJob): Before this offline store can be used as the batch source for a feature view in a feature repo, a subclass of the `DataSource` [base class](https://rtd.feast.dev/en/master/index.html?highlight=DataSource#feast.data\_source.DataSource) needs to be defined. This class is responsible for holding information needed by specific feature views to support reading historical values from the offline store. For example, a feature view using Redshift as the offline store may need to know which table contains historical feature values. -The data source class should implement two methods - `from_proto`, and `to_proto`. +The data source class should implement two methods - `from_proto`, and `to_proto`. For custom offline stores that are not being implemented in the main feature repo, the `custom_options` field should be used to store any configuration needed by the data source. In this case, the implementer is responsible for serializing this configuration into bytes in the `to_proto` method and reading the value back from bytes in the `from_proto` method. @@ -317,9 +321,9 @@ class CustomFileDataSource(FileSource): ``` {% endcode %} -## 5. Using the custom offline store +## 5. Using the custom offline store -After implementing these classes, the custom offline store can be used by referencing it in a feature repo's `feature_store.yaml` file, specifically in the `offline_store` field. The value specified should be the fully qualified class name of the OfflineStore. +After implementing these classes, the custom offline store can be used by referencing it in a feature repo's `feature_store.yaml` file, specifically in the `offline_store` field. The value specified should be the fully qualified class name of the OfflineStore. As long as your OfflineStore class is available in your Python environment, it will be imported by Feast dynamically at runtime. 
@@ -372,17 +376,17 @@ driver_hourly_stats_view = FeatureView( Even if you have created the `OfflineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. 1. In order to test against the test suite, you need to create a custom `DataSourceCreator` that implement our testing infrastructure methods, `create_data_source` and optionally, `created_saved_dataset_destination`. - * `create_data_source` should create a datasource based on the dataframe passed in. It may be implemented by uploading the contents of the dataframe into the offline store and returning a datasource object pointing to that location. See `BigQueryDataSourceCreator` for an implementation of a data source creator. - * `created_saved_dataset_destination` is invoked when users need to save the dataset for use in data validation. This functionality is still in alpha and is **optional**. + * `create_data_source` should create a datasource based on the dataframe passed in. It may be implemented by uploading the contents of the dataframe into the offline store and returning a datasource object pointing to that location. See `BigQueryDataSourceCreator` for an implementation of a data source creator. + * `created_saved_dataset_destination` is invoked when users need to save the dataset for use in data validation. This functionality is still in alpha and is **optional**. +2. Make sure that your offline store doesn't break any unit tests first by running: -2. Make sure that your offline store doesn't break any unit tests first by running: ``` make test-python ``` +3. Next, set up your offline store to run the universal integration tests. These are integration tests specifically intended to test offline and online stores against Feast API functionality, to ensure that the Feast APIs work with your offline store. -3. Next, set up your offline store to run the universal integration tests.
These are integration tests specifically intended to test offline and online stores against Feast API functionality, to ensure that the Feast APIs works with your offline store. - - Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different offline store classes for testing. - - To overwrite the default configurations to use your own offline store, you can simply create your own file that contains a `FULL_REPO_CONFIGS` dictionary, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. The module should add new `IntegrationTestRepoConfig` classes to the `AVAILABLE_OFFLINE_STORES` by defining an offline store that you would like Feast to test with. + * Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different offline store classes for testing. + * To overwrite the default configurations to use your own offline store, you can simply create your own file that contains a `FULL_REPO_CONFIGS` dictionary, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. The module should add new `IntegrationTestRepoConfig` classes to the `AVAILABLE_OFFLINE_STORES` by defining an offline store that you would like Feast to test with. A sample `FULL_REPO_CONFIGS_MODULE` looks something like this: @@ -394,8 +398,7 @@ Even if you have created the `OfflineStore` class in a separate repo, you can st AVAILABLE_OFFLINE_STORES = [("local", PostgreSQLDataSourceCreator)] ``` - -4. You should swap out the `FULL_REPO_CONFIGS` environment variable and run the integration tests against your offline store. In the example repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_offline_store/feast_tests.py`, so you would run: +4. 
You should swap out the `FULL_REPO_CONFIGS` environment variable and run the integration tests against your offline store. In the example repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_offline_store/feast_tests.py`, so you would run: ```bash export FULL_REPO_CONFIGS_MODULE='feast_custom_offline_store.feast_tests' @@ -404,19 +407,46 @@ Even if you have created the `OfflineStore` class in a separate repo, you can st If the integration tests fail, this indicates that there is a mistake in the implementation of this offline store! -5. Remember to add your datasource to `repo_config.py` similar to how we added `spark`, `trino`, etc, to the dictionary `OFFLINE_STORE_CLASS_FOR_TYPE` and add the necessary configuration to `repo_configuration.py`. Namely, `AVAILABLE_OFFLINE_STORES` should load your repo configuration module. +5. Remember to add your datasource to `repo_config.py` similar to how we added `spark`, `trino`, etc, to the dictionary `OFFLINE_STORE_CLASS_FOR_TYPE`. This will allow Feast to load your class from the `feature_store.yaml`. + +6. Finally, add a Makefile target to the Makefile to run your datastore specific tests by setting the `FULL_REPO_CONFIGS_MODULE` and `PYTEST_PLUGINS` environment variable. The `PYTEST_PLUGINS` environment variable allows pytest to load in the `DataSourceCreator` for your datasource. You can remove certain tests that are not relevant or still do not work for your datastore using the `-k` option. + +{% code title="Makefile" %} +```Makefile +test-python-universal-spark: + PYTHONPATH='.' 
\ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.spark_repo_configuration \ + PYTEST_PLUGINS=feast.infra.offline_stores.contrib.spark_offline_store.tests \ + FEAST_USAGE=False IS_TEST=True \ + python -m pytest -n 8 --integration \ + -k "not test_historical_retrieval_fails_on_validation and \ + not test_historical_retrieval_with_validation and \ + not test_historical_features_persisting and \ + not test_historical_retrieval_fails_on_validation and \ + not test_universal_cli and \ + not test_go_feature_server and \ + not test_feature_logging and \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store.py and \ + not gcs_registry and \ + not s3_registry and \ + not test_universal_types" \ + sdk/python/tests +``` +{% endcode %} ### 7. Dependencies -Add any dependencies for your offline store to our `sdk/python/setup.py` under a new `__REQUIRED` list with the packages and add it to the setup script so that if your offline store is needed, users can install the necessary python packages. These packages should be defined as extras so that they are not installed by users by default. -You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. In each environment, run the following commands: +Add any dependencies for your offline store to our `sdk/python/setup.py` under a new `__REQUIRED` list with the packages and add it to the setup script so that if your offline store is needed, users can install the necessary python packages. These packages should be defined as extras so that they are not installed by users by default. You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. 
In each environment, run the following commands: ``` export PYTHON= make lock-python-ci-dependencies ``` - ### 8. Add Documentation Remember to add documentation for your offline store. @@ -425,12 +455,12 @@ Remember to add documentation for your offline store. 2. You should also add a reference in `docs/reference/data-sources/README.md` and `docs/SUMMARY.md` to these markdown files. **NOTE**: Be sure to document the following things about your offline store: -- How to create the datasource and most what configuration is needed in the `feature_store.yaml` file in order to create the datasource. -- Make sure to flag that the datasource is in alpha development. -- Add some documentation on what the data model is for the specific offline store for more clarity. -- Finally, generate the python code docs by running: + +* How to create the datasource and what configuration is needed in the `feature_store.yaml` file in order to create the datasource. +* Make sure to flag that the datasource is in alpha development. +* Add some documentation on what the data model is for the specific offline store for more clarity. +* Finally, generate the python code docs by running: ```bash make build-sphinx ``` - diff --git a/docs/how-to-guides/adding-support-for-a-new-online-store.md b/docs/how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md similarity index 82% rename from docs/how-to-guides/adding-support-for-a-new-online-store.md rename to docs/how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md index d1f5986f18..52f0897138 100644 --- a/docs/how-to-guides/adding-support-for-a-new-online-store.md +++ b/docs/how-to-guides/customizing-feast/adding-support-for-a-new-online-store.md @@ -2,13 +2,12 @@ ## Overview -Feast makes adding support for a new online store (database) easy.
Developers can simply implement the [OnlineStore](../../sdk/python/feast/infra/online\_stores/online\_store.py#L26) interface to add support for a new store (other than the existing stores like Redis, DynamoDB, SQLite, and Datastore). +Feast makes adding support for a new online store (database) easy. Developers can simply implement the [OnlineStore](../../../sdk/python/feast/infra/online\_stores/online\_store.py#L26) interface to add support for a new store (other than the existing stores like Redis, DynamoDB, SQLite, and Datastore). In this guide, we will show you how to integrate with MySQL as an online store. While we will be implementing a specific store, this guide should be representative for adding support for any new online store. The full working code for this guide can be found at [feast-dev/feast-custom-online-store-demo](https://github.com/feast-dev/feast-custom-online-store-demo). - The process of using a custom online store consists of 6 steps: 1. Defining the `OnlineStore` class. @@ -21,7 +20,7 @@ The process of using a custom online store consists of 6 steps: ## 1. Defining an OnlineStore class {% hint style="info" %} - OnlineStore class names must end with the OnlineStore suffix! +OnlineStore class names must end with the OnlineStore suffix! {% endhint %} ### Contrib online stores @@ -30,19 +29,21 @@ New online stores go in `sdk/python/feast/infra/online_stores/contrib/`. #### What is a contrib plugin? -- Not guaranteed to implement all interface methods -- Not guaranteed to be stable. -- Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. +* Not guaranteed to implement all interface methods +* Not guaranteed to be stable. +* Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. #### How do I make a contrib plugin an "official" plugin? 
+ To move an online store plugin out of contrib, you need: -- GitHub actions (i.e `make test-python-integration`) is setup to run all tests against the online store and pass. -- At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). + +* GitHub actions (i.e. `make test-python-integration`) is set up to run all tests against the online store and pass. +* At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). The OnlineStore class broadly contains two sets of methods * One set deals with managing infrastructure that the online store needed for operations -* One set deals with writing data into the store, and reading data from the store. +* One set deals with writing data into the store, and reading data from the store. ### 1.1 Infrastructure Methods @@ -50,11 +51,11 @@ There are two methods that deal with managing infrastructure for online stores, * `update` is invoked when users run `feast apply` as a CLI command, or the `FeatureStore.apply()` sdk method. -The `update` method should be used to perform any operations necessary before data can be written to or read from the store. The `update` method can be used to create MySQL tables in preparation for reads and writes to new feature views. +The `update` method should be used to perform any operations necessary before data can be written to or read from the store. The `update` method can be used to create MySQL tables in preparation for reads and writes to new feature views. * `teardown` is invoked when users run `feast teardown` or `FeatureStore.teardown()`. -The `teardown` method should be used to perform any clean-up operations. `teardown` can be used to drop MySQL indices and tables corresponding to the feature views being deleted. +The `teardown` method should be used to perform any clean-up operations. `teardown` can be used to drop MySQL indices and tables corresponding to the feature views being deleted.
{% code title="feast_custom_online_store/mysql.py" %} ```python @@ -123,10 +124,10 @@ def teardown( ### 1.2 Read/Write Methods -There are two methods that deal with writing data to and from the online stores.`online_write_batch `and `online_read`. +There are two methods that deal with writing data to and from the online stores: `online_write_batch` and `online_read`. -* `online_write_batch `is invoked when running materialization (using the `feast materialize` or `feast materialize-incremental` commands, or the corresponding `FeatureStore.materialize()` method. -* `online_read `is invoked when reading values from the online store using the `FeatureStore.get_online_features()` method. +* `online_write_batch` is invoked when running materialization (using the `feast materialize` or `feast materialize-incremental` commands), or the corresponding `FeatureStore.materialize()` method. +* `online_read` is invoked when reading values from the online store using the `FeatureStore.get_online_features()` method. {% code title="feast_custom_online_store/mysql.py" %} ```python @@ -210,22 +211,24 @@ def online_read( ### 1.3 Type Mapping Most online stores will have to perform some custom mapping of online store datatypes to feast value types. -- The function to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. + +* The functions to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. * `source_datatype_to_feast_value_type` is used to convert your DataSource's datatypes to feast value types. * `get_column_names_and_types` retrieves the column names and corresponding datasource types. Add any helper functions for type conversion to `sdk/python/feast/type_map.py`. -- Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data.
+ +* Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data. ## 2. Defining an OnlineStoreConfig class Additional configuration may be needed to allow the OnlineStore to talk to the backing store. For example, MySQL may need configuration information like the host at which the MySQL instance is running, credentials for connecting to the database, etc. -To facilitate configuration, all OnlineStore implementations are **required** to also define a corresponding OnlineStoreConfig class in the same file. This OnlineStoreConfig class should inherit from the `FeastConfigBaseModel` class, which is defined [here](../../sdk/python/feast/repo\_config.py#L44). +To facilitate configuration, all OnlineStore implementations are **required** to also define a corresponding OnlineStoreConfig class in the same file. This OnlineStoreConfig class should inherit from the `FeastConfigBaseModel` class, which is defined [here](../../../sdk/python/feast/repo\_config.py#L44). The `FeastConfigBaseModel` is a [pydantic](https://pydantic-docs.helpmanual.io) class, which parses yaml configuration into python objects. Pydantic also allows the model classes to define validators for the config classes, to make sure that the config classes are correctly defined. -This config class **must** container a `type` field, which contains the fully qualified class name of its corresponding OnlineStore class. +This config class **must** contain a `type` field, which contains the fully qualified class name of its corresponding OnlineStore class. Additionally, the name of the config class must be the same as the OnlineStore class, with the `Config` suffix.
@@ -254,7 +257,7 @@ online_store: ``` {% endcode %} -This configuration information is available to the methods of the OnlineStore, via the`config: RepoConfig` parameter which is passed into all the methods of the OnlineStore interface, specifically at the `config.online_store` field of the `config` parameter. +This configuration information is available to the methods of the OnlineStore, via the`config: RepoConfig` parameter which is passed into all the methods of the OnlineStore interface, specifically at the `config.online_store` field of the `config` parameter. {% code title="feast_custom_online_store/mysql.py" %} ```python @@ -281,9 +284,9 @@ def online_write_batch( ``` {% endcode %} -## 3. Using the custom online store +## 3. Using the custom online store -After implementing both these classes, the custom online store can be used by referencing it in a feature repo's `feature_store.yaml` file, specifically in the `online_store` field. The value specified should be the fully qualified class name of the OnlineStore. +After implementing both these classes, the custom online store can be used by referencing it in a feature repo's `feature_store.yaml` file, specifically in the `online_store` field. The value specified should be the fully qualified class name of the OnlineStore. As long as your OnlineStore class is available in your Python environment, it will be imported by Feast dynamically at runtime. @@ -302,7 +305,7 @@ online_store: ``` {% endcode %} -If additional configuration for the online store is **not **required, then we can omit the other fields and only specify the `type` of the online store class as the value for the `online_store`. +If additional configuration for the online store is **not** required, then we can omit the other fields and only specify the `type` of the online store class as the value for the `online_store`.
{% code title="feature_repo/feature_store.yaml" %} ```yaml @@ -315,19 +318,18 @@ online_store: feast_custom_online_store.mysql.MySQLOnlineStore ## 4. Testing the OnlineStore class -### Integrating with the integration test suite and unit test suite. +### 4.1 Integrating with the integration test suite and unit test suite. Even if you have created the `OnlineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. -1. In the Feast submodule, we can run all the unit tests and make sure they pass: +1. In the Feast submodule, we can run all the unit tests and make sure they pass: + ``` make test-python ``` - - 2. The universal tests, which are integration tests specifically intended to test offline and online stores, should be run against Feast to ensure that the Feast APIs works with your online store. - - Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different online store classes for testing. - - To overwrite these configurations, you can simply create your own file that contains a `FULL_REPO_CONFIGS` variable, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. + * Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different online store classes for testing. + * To overwrite these configurations, you can simply create your own file that contains a `FULL_REPO_CONFIGS` variable, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. 
A sample `FULL_REPO_CONFIGS_MODULE` looks something like this: @@ -341,10 +343,8 @@ AVAILABLE_ONLINE_STORES = {"postgres": (None, PostgreSQLDataSourceCreator)} ``` {% endcode %} - If you are planning to start the online store up locally(e.g spin up a local Redis Instance) for testing, then the dictionary entry should be something like: - ```python { "sqlite": ({"type": "sqlite"}, None), @@ -352,9 +352,7 @@ If you are planning to start the online store up locally(e.g spin up a local Red } ``` - -If you are planning instead to use a Dockerized container to run your tests against your online store, you can define a `OnlineStoreCreator` and replace the `None` object above with your `OnlineStoreCreator` class. - +If you are planning instead to use a Dockerized container to run your tests against your online store, you can define a `OnlineStoreCreator` and replace the `None` object above with your `OnlineStoreCreator` class. You should make this class available to pytest through the `PYTEST_PLUGINS` environment variable. If you create a containerized docker image for testing, developers who are trying to test with your online store will not have to spin up their own instance of the online store for testing. An example of an `OnlineStoreCreator` is shown below: @@ -374,40 +372,48 @@ class RedisOnlineStoreCreator(OnlineStoreCreator): ``` {% endcode %} -3\. You should swap out the `FULL_REPO_CONFIGS` environment variable and run the integration tests against your online store. In the example repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_online_store/feast_tests.py`, so you would run: - -```bash -export FULL_REPO_CONFIGS_MODULE='feast_custom_online_store.feast_tests' -make test-python-universal +3\. Add a Makefile target to the Makefile to run your datastore specific tests by setting the `FULL_REPO_CONFIGS_MODULE` environment variable. Add `PYTEST_PLUGINS` if pytest is having trouble loading your `DataSourceCreator`. 
You can remove certain tests that are not relevant or still do not work for your datastore using the `-k` option. + +{% code title="Makefile" %} +```Makefile +test-python-universal-cassandra: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.online_stores.contrib.cassandra_repo_configuration \ + PYTEST_PLUGINS=sdk.python.tests.integration.feature_repos.universal.online_store.cassandra \ + FEAST_USAGE=False \ + IS_TEST=True \ + python -m pytest -x --integration \ + sdk/python/tests ``` +{% endcode %} -- If there are some tests that fail, this indicates that there is a mistake in the implementation of this online store! - +* If there are some tests that fail, this indicates that there is a mistake in the implementation of this online store! ### 5. Add Dependencies Add any dependencies for your online store to our `sdk/python/setup.py` under a new `_REQUIRED` list with the packages and add it to the setup script so that if your online store is needed, users can install the necessary python packages. These packages should be defined as extras so that they are not installed by users by default. -- You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. In each environment, run the following commands: + +* You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. In each environment, run the following commands: ``` export PYTHON= make lock-python-ci-dependencies ``` - ### 6. Add Documentation Remember to add the documentation for your online store. -1. Add a new markdown file to `docs/reference/online-stores/`. + +1. Add a new markdown file to `docs/reference/online-stores/`. 2. You should also add a reference in `docs/reference/online-stores/README.md` and `docs/SUMMARY.md`. Add a new markdown document to document your online store functionality similar to how the other online stores are documented. 
**NOTE**:Be sure to document the following things about your online store: -- Be sure to cover how to create the datasource and what configuration is needed in the `feature_store.yaml` file in order to create the datasource. -- Make sure to flag that the online store is in alpha development. -- Add some documentation on what the data model is for the specific online store for more clarity. -- Finally, generate the python code docs by running: + +* Be sure to cover how to create the datasource and what configuration is needed in the `feature_store.yaml` file in order to create the datasource. +* Make sure to flag that the online store is in alpha development. +* Add some documentation on what the data model is for the specific online store for more clarity. +* Finally, generate the python code docs by running: ```bash make build-sphinx ``` - diff --git a/docs/how-to-guides/creating-a-custom-materialization-engine.md b/docs/how-to-guides/customizing-feast/creating-a-custom-materialization-engine.md similarity index 92% rename from docs/how-to-guides/creating-a-custom-materialization-engine.md rename to docs/how-to-guides/customizing-feast/creating-a-custom-materialization-engine.md index 935ac3dc99..cca7bd3621 100644 --- a/docs/how-to-guides/creating-a-custom-materialization-engine.md +++ b/docs/how-to-guides/customizing-feast/creating-a-custom-materialization-engine.md @@ -1,4 +1,4 @@ -# Adding a custom materialization engine +# Adding a custom batch materialization engine ### Overview @@ -7,10 +7,10 @@ Feast batch materialization operations (`materialize` and `materialize-increment Custom batch materialization engines allow Feast users to extend Feast to customize the materialization process. Examples include: * Setting up custom materialization-specific infrastructure during `feast apply` (e.g. 
setting up Spark clusters or Lambda Functions) -* Launching custom batch ingestion \(materialization\) jobs \(Spark, Beam, AWS Lambda\) +* Launching custom batch ingestion (materialization) jobs (Spark, Beam, AWS Lambda) * Tearing down custom materialization-specific infrastructure during `feast teardown` (e.g. tearing down Spark clusters, or deleting Lambda Functions) -Feast comes with built-in materialization engines, e.g, `LocalMaterializationEngine`, and an experimental `LambdaMaterializationEngine`. However, users can develop their own materialization engines by creating a class that implements the contract in the [BatchMaterializationEngine class](https://github.com/feast-dev/feast/blob/6d7b38a39024b7301c499c20cf4e7aef6137c47c/sdk/python/feast/infra/materialization/batch_materialization_engine.py#L72). +Feast comes with built-in materialization engines, e.g, `LocalMaterializationEngine`, and an experimental `LambdaMaterializationEngine`. However, users can develop their own materialization engines by creating a class that implements the contract in the [BatchMaterializationEngine class](https://github.com/feast-dev/feast/blob/6d7b38a39024b7301c499c20cf4e7aef6137c47c/sdk/python/feast/infra/materialization/batch\_materialization\_engine.py#L72). ### Guide @@ -79,14 +79,13 @@ class MyCustomEngine(LocalMaterializationEngine): ) for task in tasks ] - ``` Notice how in the above engine we have only overwritten two of the methods on the `LocalMaterializatinEngine`, namely `update` and `materialize`. These two methods are convenient to replace if you are planning to launch custom batch jobs. 
#### Step 2: Configuring Feast to use the engine -Configure your [feature\_store.yaml](../reference/feature-repository/feature-store-yaml.md) file to point to your new engine class: +Configure your [feature\_store.yaml](../../reference/feature-repository/feature-store-yaml.md) file to point to your new engine class: ```yaml project: repo @@ -99,7 +98,7 @@ offline_store: type: file ``` -Notice how the `batch_engine` field above points to the module and class where your engine can be found. +Notice how the `batch_engine` field above points to the module and class where your engine can be found. #### Step 3: Using the engine @@ -109,7 +108,7 @@ Now you should be able to use your engine by running a Feast command: feast apply ``` -```text +``` Registered entity driver_id Registered feature view driver_hourly_stats Deploying infrastructure for driver_hourly_stats diff --git a/docs/how-to-guides/creating-a-custom-provider.md b/docs/how-to-guides/customizing-feast/creating-a-custom-provider.md similarity index 93% rename from docs/how-to-guides/creating-a-custom-provider.md rename to docs/how-to-guides/customizing-feast/creating-a-custom-provider.md index 40ec20ee6a..f2bc3f8327 100644 --- a/docs/how-to-guides/creating-a-custom-provider.md +++ b/docs/how-to-guides/customizing-feast/creating-a-custom-provider.md @@ -6,8 +6,8 @@ All Feast operations execute through a `provider`. Operations like materializing Custom providers allow Feast users to extend Feast to execute any custom logic. 
Examples include: -* Launching custom streaming ingestion jobs \(Spark, Beam\) -* Launching custom batch ingestion \(materialization\) jobs \(Spark, Beam\) +* Launching custom streaming ingestion jobs (Spark, Beam) +* Launching custom batch ingestion (materialization) jobs (Spark, Beam) * Adding custom validation to feature repositories during `feast apply` * Adding custom infrastructure setup logic which runs during `feast apply` * Extending Feast commands with in-house metrics, logging, or tracing @@ -37,7 +37,7 @@ from feast.infra.local import LocalProvider from feast.infra.offline_stores.offline_store import RetrievalJob from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.registry import Registry +from feast.infra.registry.registry import Registry from feast.repo_config import RepoConfig @@ -87,7 +87,7 @@ It is possible to overwrite all the methods on the provider class. In fact, it i #### Step 2: Configuring Feast to use the provider -Configure your [feature\_store.yaml](../reference/feature-repository/feature-store-yaml.md) file to point to your new provider class: +Configure your [feature\_store.yaml](../../reference/feature-repository/feature-store-yaml.md) file to point to your new provider class: ```yaml project: repo @@ -100,7 +100,7 @@ offline_store: type: file ``` -Notice how the `provider` field above points to the module and class where your provider can be found. +Notice how the `provider` field above points to the module and class where your provider can be found. #### Step 3: Using the provider @@ -110,7 +110,7 @@ Now you should be able to use your provider by running a Feast command: feast apply ``` -```text +``` Registered entity driver_id Registered feature view driver_hourly_stats Deploying infrastructure for driver_hourly_stats @@ -128,4 +128,3 @@ That's it. You should now have a fully functional custom provider! 
### Next steps Have a look at the [custom provider demo repository](https://github.com/feast-dev/feast-custom-provider-demo) for a fully functional example of a custom provider. Feel free to fork it when creating your own custom provider! - diff --git a/docs/how-to-guides/running-feast-in-production.md b/docs/how-to-guides/running-feast-in-production.md index f03629ea4b..61b7b1fe40 100644 --- a/docs/how-to-guides/running-feast-in-production.md +++ b/docs/how-to-guides/running-feast-in-production.md @@ -2,20 +2,16 @@ ## Overview -After learning about Feast concepts and playing with Feast locally, you're now ready to use Feast in production. -This guide aims to help with the transition from a sandbox project to production-grade deployment in the cloud or on-premise. +After learning about Feast concepts and playing with Feast locally, you're now ready to use Feast in production. This guide aims to help with the transition from a sandbox project to production-grade deployment in the cloud or on-premise. Overview of typical production configuration is given below: ![Overview](production-simple.png) {% hint style="success" %} -**Important note:** We're trying to keep Feast modular. With the exception of the core, most of the Feast blocks are loosely connected and can be used independently. Hence, you are free to build your own production configuration. -For example, you might not have a stream source and, thus, no need to write features in real-time to an online store. -Or you might not need to retrieve online features. +**Important note:** Feast is highly customizable and modular. Most Feast blocks are loosely connected and can be used independently. Hence, you are free to build your own production configuration. -Furthermore, there's no single "true" approach. As you will see in this guide, Feast usually provides several options for each problem. -It's totally up to you to pick a path that's better suited to your needs. 
+For example, you might not have a stream source and, thus, no need to write features in real-time to an online store. Or you might not need to retrieve online features. Feast also often provides multiple options to achieve the same goal. We discuss tradeoffs below. {% endhint %} In this guide we will show you how to: @@ -28,77 +24,30 @@ In this guide we will show you how to: ## 1. Automatically deploying changes to your feature definitions -The first step to setting up a deployment of Feast is to create a Git repository that contains your feature definitions. The recommended way to version and track your feature definitions is by committing them to a repository and tracking changes through commits. - -Most teams will need to have a feature store deployed to more than one environment. We have created an example repository \([Feast Repository Example](https://github.com/feast-dev/feast-ci-repo-example)\) which contains two Feast projects, one per environment. - -The contents of this repository are shown below: - -```bash -├── .github -│ └── workflows -│ ├── production.yml -│ └── staging.yml -│ -├── staging -│ ├── driver_repo.py -│ └── feature_store.yaml -│ -└── production - ├── driver_repo.py - └── feature_store.yaml -``` +### Setting up a feature repository -The repository contains three sub-folders: +The first step to setting up a deployment of Feast is to create a Git repository that contains your feature definitions. The recommended way to version and track your feature definitions is by committing them to a repository and tracking changes through commits. If you recall, running `feast apply` commits feature definitions to a **registry**, which users can then read elsewhere. -* `staging/`: This folder contains the staging `feature_store.yaml` and Feast objects. Users that want to make changes to the Feast deployment in the staging environment will commit changes to this directory. 
-* `production/`: This folder contains the production `feature_store.yaml` and Feast objects. Typically users would first test changes in staging before copying the feature definitions into the production folder, before committing the changes. -* `.github`: This folder is an example of a CI system that applies the changes in either the `staging` or `production` repositories using `feast apply`. This operation saves your feature definitions to a shared registry \(for example, on GCS\) and configures your infrastructure for serving features. +### Setting up CI/CD to automatically update the registry -The `feature_store.yaml` contains the following: +We recommend typically setting up CI/CD to automatically run `feast plan` and `feast apply` when pull requests are opened / merged. -```text -project: staging -registry: gs://feast-ci-demo-registry/staging/registry.db -provider: gcp -``` +### Setting up multiple environments -Notice how the registry has been configured to use a Google Cloud Storage bucket. All changes made to infrastructure using `feast apply` are tracked in the `registry.db`. This registry will be accessed later by the Feast SDK in your training pipelines or model serving services in order to read features. +A common scenario when using Feast in production is to want to test changes to Feast object definitions. For this, we recommend setting up a _staging_ environment for your offline and online stores, which mirrors _production_ (with potentially a smaller data set). +Having this separate environment allows users to test changes by first applying them to staging, and then promoting the changes to production after verifying the changes on staging. -{% hint style="success" %} -It is important to note that the CI system above must have access to create, modify, or remove infrastructure in your production environment. This is unlike clients of the feature store, who will only have read access. 
-{% endhint %} - -If your organization consists of many independent data science teams or a single group is working on several projects -that could benefit from sharing features, entities, sources, and transformations, then we encourage you to utilize Python packages inside each environment: - -``` -└── production - ├── common - │ ├── __init__.py - │ ├── sources.py - │ └── entities.py - ├── ranking - │ ├── __init__.py - │ ├── views.py - │ └── transformations.py - ├── segmentation - │ ├── __init__.py - │ ├── views.py - │ └── transformations.py - └── feature_store.yaml -``` - -In summary, once you have set up a Git based repository with CI that runs `feast apply` on changes, your infrastructure \(offline store, online store, and cloud environment\) will automatically be updated to support the loading of data into the feature store or retrieval of data. +Different options are presented in the [how-to guide](structuring-repos.md). ## 2. How to load data into your online store and keep it up to date To keep your online store up to date, you need to run a job that loads feature data from your feature view sources into your online store. In Feast, this loading operation is called materialization. ### 2.1. Manual materializations + The simplest way to schedule materialization is to run an **incremental** materialization using the Feast CLI: -```text +``` feast materialize-incremental 2022-01-01T00:00:00 ``` @@ -106,9 +55,9 @@ The above command will load all feature values from all feature view sources int A timestamp is required to set the end date for materialization. If your source is fully up to date then the end date would be the current time. However, if you are querying a source where data is not yet available, then you do not want to set the timestamp to the current time. You would want to use a timestamp that ends at a date for which data is available. 
The next time `materialize-incremental` is run, Feast will load data that starts from the previous end date, so it is important to ensure that the materialization interval does not overlap with time periods for which data has not been made available. This is commonly the case when your source is an ETL pipeline that is scheduled on a daily basis. -An alternative approach to incremental materialization \(where Feast tracks the intervals of data that need to be ingested\), is to call Feast directly from your scheduler like Airflow. In this case, Airflow is the system that tracks the intervals that have been ingested. +An alternative approach to incremental materialization (where Feast tracks the intervals of data that need to be ingested), is to call Feast directly from your scheduler like Airflow. In this case, Airflow is the system that tracks the intervals that have been ingested. -```text +``` feast materialize -v driver_hourly_stats 2020-01-01T00:00:00 2020-01-02T00:00:00 ``` @@ -118,14 +67,10 @@ The timestamps above should match the interval of data that has been computed by ### 2.2. Automate periodic materializations -It is up to you which orchestration/scheduler to use to periodically run `$ feast materialize`. -Feast keeps the history of materialization in its registry so that the choice could be as simple as a [unix cron util](https://en.wikipedia.org/wiki/Cron). -Cron util should be sufficient when you have just a few materialization jobs (it's usually one materialization job per feature view) triggered infrequently. -However, the amount of work can quickly outgrow the resources of a single machine. That happens because the materialization job needs to repackage all rows before writing them to an online store. That leads to high utilization of CPU and memory. -In this case, you might want to use a job orchestrator to run multiple jobs in parallel using several workers. -Kubernetes Jobs or Airflow are good choices for more comprehensive job orchestration. 
+It is up to you which orchestration/scheduler to use to periodically run `$ feast materialize`. Feast keeps the history of materialization in its registry so that the choice could be as simple as a [unix cron util](https://en.wikipedia.org/wiki/Cron). Cron util should be sufficient when you have just a few materialization jobs (it's usually one materialization job per feature view) triggered infrequently. However, the amount of work can quickly outgrow the resources of a single machine. That happens because the materialization job needs to repackage all rows before writing them to an online store. That leads to high utilization of CPU and memory. In this case, you might want to use a job orchestrator to run multiple jobs in parallel using several workers. Kubernetes Jobs or Airflow are good choices for more comprehensive job orchestration. If you are using Airflow as a scheduler, Feast can be invoked through the [BashOperator](https://airflow.apache.org/docs/apache-airflow/stable/howto/operator/bash.html) after the [Python SDK](https://pypi.org/project/feast/) has been installed into a virtual environment and your feature repo has been synced: + ```python materialize = BashOperator( task_id='materialize', @@ -134,8 +79,7 @@ materialize = BashOperator( ``` {% hint style="success" %} -Important note: Airflow worker must have read and write permissions to the registry file on GS / S3 -since it pulls configuration and updates materialization history. +Important note: Airflow worker must have read and write permissions to the registry file on GS / S3 since it pulls configuration and updates materialization history. {% endhint %} ## 3. How to use Feast for model training @@ -207,17 +151,14 @@ It is important to note that both the training pipeline and model serving servic ## 4. 
Retrieving online features for prediction -Once you have successfully loaded (or in Feast terminology materialized) your data from batch sources into the online store, you can start consuming features for model inference. -There are three approaches for that purpose sorted from the most simple one (in an operational sense) to the most performant (benchmarks to be published soon): +Once you have successfully loaded (or in Feast terminology materialized) your data from batch sources into the online store, you can start consuming features for model inference. There are three approaches for that purpose sorted from the most simple one (in an operational sense) to the most performant (benchmarks to be published soon): ### 4.1. Use the Python SDK within an existing Python service -This approach is the most convenient to keep your infrastructure as minimalistic as possible and avoid deploying extra services. -The Feast Python SDK will connect directly to the online store (Redis, Datastore, etc), pull the feature data, and run transformations locally (if required). -The obvious drawback is that your service must be written in Python to use the Feast Python SDK. -A benefit of using a Python stack is that you can enjoy production-grade services with integrations with many existing data science tools. +This approach is the most convenient to keep your infrastructure as minimalistic as possible and avoid deploying extra services. The Feast Python SDK will connect directly to the online store (Redis, Datastore, etc), pull the feature data, and run transformations locally (if required). The obvious drawback is that your service must be written in Python to use the Feast Python SDK. A benefit of using a Python stack is that you can enjoy production-grade services with integrations with many existing data science tools. 
To integrate online retrieval into your service use the following code: + ```python from feast import FeatureStore @@ -235,49 +176,45 @@ feature_vector = fs.get_online_features( ### 4.2. Consume features via HTTP API from Serverless Feature Server -If you don't want to add the Feast Python SDK as a dependency, or your feature retrieval service is written in a non-Python language, -Feast can deploy a simple feature server -on serverless infrastructure (eg, AWS Lambda, Google Cloud Run) for you. -This service will provide an HTTP API with JSON I/O, which can be easily used with any programming language. +If you don't want to add the Feast Python SDK as a dependency, or your feature retrieval service is written in a non-Python language, Feast can deploy a simple feature server on serverless infrastructure (eg, AWS Lambda, Google Cloud Run) for you. This service will provide an HTTP API with JSON I/O, which can be easily used with any programming language. -[Read more about this feature](../reference/alpha-aws-lambda-feature-server.md) +[Read more about this feature](../reference/feature-servers/alpha-aws-lambda-feature-server.md) ### 4.3. Go feature server deployed on Kubernetes -For users with very latency-sensitive and high QPS use-cases, Feast offers a high-performance [Go feature server](../reference/feature-servers/go-feature-server.md). -It can use either HTTP or gRPC. +For users with very latency-sensitive and high QPS use-cases, Feast offers a high-performance [Go feature server](../reference/feature-servers/go-feature-server.md). It can use either HTTP or gRPC. The Go feature server can be deployed to a Kubernetes cluster via Helm charts in a few simple steps: 1. Install [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) and [helm 3](https://helm.sh/) 2. Add the Feast Helm repository and download the latest charts: + ``` helm repo add feast-charts https://feast-helm-charts.storage.googleapis.com helm repo update ``` -3. Run Helm Install + +1. 
Run Helm Install + ``` helm install feast-release feast-charts/feast-feature-server \ --set global.registry.path=s3://feast/registries/prod \ --set global.project= ``` -This chart will deploy a single service. -The service must have read access to the registry file on cloud storage. -It will keep a copy of the registry in their memory and periodically refresh it, so expect some delays in update propagation in exchange for better performance. -In order for the Go feature server to be enabled, you should set `go_feature_serving: True` in the `feature_store.yaml`. +This chart will deploy a single service. The service must have read access to the registry file on cloud storage. It will keep a copy of the registry in their memory and periodically refresh it, so expect some delays in update propagation in exchange for better performance. In order for the Go feature server to be enabled, you should set `go_feature_serving: True` in the `feature_store.yaml`. ## 5. Ingesting features from a stream source -Recently Feast added functionality for [stream ingestion](../reference/data-sources/push.md). -Please note that this is still in an early phase and new incompatible changes may be introduced. +Recently Feast added functionality for [stream ingestion](../reference/data-sources/push.md). Please note that this is still in an early phase and new incompatible changes may be introduced. ### 5.1. Using Python SDK in your Apache Spark / Beam pipeline -The default option to write features from a stream is to add the Python SDK into your existing PySpark / Beam pipeline. -Feast SDK provides writer implementation that can be called from `foreachBatch` stream writer in PySpark like this: +The default option to write features from a stream is to add the Python SDK into your existing PySpark / Beam pipeline. 
Feast SDK provides writer implementation that can be called from `foreachBatch` stream writer in PySpark like this: ```python +from feast import FeatureStore + store = FeatureStore(...) def feast_writer(spark_df): @@ -289,21 +226,17 @@ streamingDF.writeStream.foreachBatch(feast_writer).start() ### 5.2. Push Service (Alpha) -Alternatively, if you want to ingest features directly from a broker (eg, Kafka or Kinesis), you can use the "push service", which will write to an online store and/or offline store. -This service will expose an HTTP API or when deployed on Serverless platforms like AWS Lambda or Google Cloud Run, -this service can be directly connected to Kinesis or PubSub. +Alternatively, if you want to ingest features directly from a broker (eg, Kafka or Kinesis), you can use the "push service", which will write to an online store and/or offline store. This service will expose an HTTP API or when deployed on Serverless platforms like AWS Lambda or Google Cloud Run, this service can be directly connected to Kinesis or PubSub. -If you are using Kafka, [HTTP Sink](https://docs.confluent.io/kafka-connect-http/current/overview.html) could be utilized as a middleware. -In this case, the "push service" can be deployed on Kubernetes or as a Serverless function. +If you are using Kafka, [HTTP Sink](https://docs.confluent.io/kafka-connect-http/current/overview.html) could be utilized as a middleware. In this case, the "push service" can be deployed on Kubernetes or as a Serverless function. ## 6. Monitoring Feast services can report their metrics to a StatsD-compatible collector. To activate this function, you'll need to provide a StatsD IP address and a port when deploying the helm chart (in future, this will be added to `feature_store.yaml`). -We use an [InfluxDB-style extension](https://github.com/prometheus/statsd_exporter#tagging-extensions) for StatsD format to be able to send tags along with metrics. 
Keep that in mind while selecting the collector ([telegraph](https://www.influxdata.com/blog/getting-started-with-sending-statsd-metrics-to-telegraf-influxdb/#introducing-influx-statsd) will work for sure). +We use an [InfluxDB-style extension](https://github.com/prometheus/statsd\_exporter#tagging-extensions) for StatsD format to be able to send tags along with metrics. Keep that in mind while selecting the collector ([telegraph](https://www.influxdata.com/blog/getting-started-with-sending-statsd-metrics-to-telegraf-influxdb/#introducing-influx-statsd) will work for sure). -We chose StatsD since it's a de-facto standard with various implementations (eg, [1](https://github.com/prometheus/statsd_exporter), [2](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/statsd/README.md)) -and metrics can be easily exported to Prometheus, InfluxDB, AWS CloudWatch, etc. +We chose StatsD since it's a de-facto standard with various implementations (eg, [1](https://github.com/prometheus/statsd\_exporter), [2](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/statsd/README.md)) and metrics can be easily exported to Prometheus, InfluxDB, AWS CloudWatch, etc. ## 7. Using environment variables in your yaml configuration @@ -329,7 +262,8 @@ online_store: connection_string: ${REDIS_CONNECTION_STRING:"0.0.0.0:6379"} ``` ---- +*** + ## Summary Summarizing it all together we want to show several options of architecture that will be most frequently used in production: @@ -345,21 +279,19 @@ Summarizing it all together we want to show several options of architecture that ![From Repository to Production: Feast Production Architecture](production-spark.png) - -### Option #2 *(still in development)* +### Option #2 _(still in development)_ Same as Option #1, except: -* Push service is deployed as AWS Lambda / Google Cloud Run and is configured as a sink for Kinesis or PubSub to ingest features directly from a stream broker. 
-Lambda / Cloud Run is being managed by Feast SDK (from CI environment) + +* Push service is deployed as AWS Lambda / Google Cloud Run and is configured as a sink for Kinesis or PubSub to ingest features directly from a stream broker. Lambda / Cloud Run is being managed by Feast SDK (from CI environment) * Materialization jobs are managed inside Kubernetes via Kubernetes Job (currently not managed by Helm) ![With Push Service as Lambda](production-lambda.png) - -### Option #3 *(still in development)* +### Option #3 _(still in development)_ Same as Option #2, except: + * Push service is deployed on Kubernetes cluster and exposes an HTTP API that can be used as a sink for Kafka (via kafka-http connector) or accessed directly. ![With Push Service in Kubernetes](production-kube.png) - diff --git a/docs/how-to-guides/scaling-feast.md b/docs/how-to-guides/scaling-feast.md new file mode 100644 index 0000000000..ce63f027c9 --- /dev/null +++ b/docs/how-to-guides/scaling-feast.md @@ -0,0 +1,26 @@ +# Scaling Feast + +## Overview + +Feast is designed to be easy to use and understand out of the box, with as few infrastructure dependencies as possible. However, there are components used by default that may not scale well. +Since Feast is designed to be modular, it's possible to swap such components with more performant components, at the cost of Feast depending on additional infrastructure. + + +### Scaling Feast Registry + +The default Feast [registry](../getting-started/concepts/registry.md) is a file-based registry. Any changes to the feature repo, or materializing data into the online store, results in a mutation to the registry. + +However, there are inherent limitations with a file-based registry, since changing a single field in the registry requires re-writing the whole registry file. +With multiple concurrent writers, this presents a risk of data loss, or bottlenecks writes to the registry since all changes have to be serialized (e.g. 
when running materialization for multiple feature views or time ranges concurrently). + +The recommended solution in this case is to use the [SQL based registry](../tutorials/using-scalable-registry.md), which allows concurrent, transactional, and fine-grained updates to the registry. This registry implementation requires access to an existing database (such as MySQL, Postgres, etc). + +### Scaling Materialization + +The default Feast materialization process is an in-memory process, which pulls data from the offline store before writing it to the online store. +However, this process does not scale for large data sets, since it's executed in a single process. + +Feast supports pluggable [Materialization Engines](../getting-started/architecture-and-components/batch-materialization-engine.md) that allow the materialization process to be scaled up. +Aside from the local process, Feast supports a [Lambda-based materialization engine](https://rtd.feast.dev/en/master/#alpha-lambda-based-engine), and a [Bytewax-based materialization engine](https://rtd.feast.dev/en/master/#bytewax-engine). + +Users may also be able to build an engine to scale up materialization using existing infrastructure in their organizations. \ No newline at end of file diff --git a/docs/how-to-guides/structuring-repos.md b/docs/how-to-guides/structuring-repos.md new file mode 100644 index 0000000000..62f7094e9f --- /dev/null +++ b/docs/how-to-guides/structuring-repos.md @@ -0,0 +1,105 @@ +# Structuring Feature Repos + +A common scenario when using Feast in production is to want to test changes to Feast object definitions. For this, we recommend setting up a _staging_ environment for your offline and online stores, which mirrors _production_ (with potentially a smaller data set). +Having this separate environment allows users to test changes by first applying them to staging, and then promoting the changes to production after verifying the changes on staging.
+ +## Setting up multiple environments + +There are three common ways teams approach having separate environments + +1. Have separate git branches for each environment +2. Have separate `feature_store.yaml` files and separate Feast object definitions that correspond to each environment +3. Have separate `feature_store.yaml` files per environment, but share the Feast object definitions + +### Different version control branches + +To keep a clear separation of the feature repos, teams may choose to have multiple long-lived branches in their version control system, one for each environment. In this approach, with CI/CD setup, changes would first be made to the staging branch, and then copied over manually to the production branch once verified in the staging environment. + +### Separate `feature_store.yaml` files and separate Feast object definitions + +For this approach, we have created an example repository ([Feast Repository Example](https://github.com/feast-dev/feast-ci-repo-example)) which contains two Feast projects, one per environment. + +The contents of this repository are shown below: + +``` +├── .github +│ └── workflows +│ ├── production.yml +│ └── staging.yml +│ +├── staging +│ ├── driver_repo.py +│ └── feature_store.yaml +│ +└── production + ├── driver_repo.py + └── feature_store.yaml +``` + +The repository contains three sub-folders: + +* `staging/`: This folder contains the staging `feature_store.yaml` and Feast objects. Users that want to make changes to the Feast deployment in the staging environment will commit changes to this directory. +* `production/`: This folder contains the production `feature_store.yaml` and Feast objects. Typically users would first test changes in staging before copying the feature definitions into the production folder, before committing the changes. +* `.github`: This folder is an example of a CI system that applies the changes in either the `staging` or `production` repositories using `feast apply`. 
This operation saves your feature definitions to a shared registry (for example, on GCS) and configures your infrastructure for serving features. + +The `feature_store.yaml` contains the following: + +``` +project: staging +registry: gs://feast-ci-demo-registry/staging/registry.db +provider: gcp +``` + +Notice how the registry has been configured to use a Google Cloud Storage bucket. All changes made to infrastructure using `feast apply` are tracked in the `registry.db`. This registry will be accessed later by the Feast SDK in your training pipelines or model serving services in order to read features. + +{% hint style="success" %} +It is important to note that the CI system above must have access to create, modify, or remove infrastructure in your production environment. This is unlike clients of the feature store, who will only have read access. +{% endhint %} + +If your organization consists of many independent data science teams or a single group is working on several projects that could benefit from sharing features, entities, sources, and transformations, then we encourage you to utilize Python packages inside each environment: + +``` +└── production + ├── common + │ ├── __init__.py + │ ├── sources.py + │ └── entities.py + ├── ranking + │ ├── __init__.py + │ ├── views.py + │ └── transformations.py + ├── segmentation + │ ├── __init__.py + │ ├── views.py + │ └── transformations.py + └── feature_store.yaml +``` + + +### Shared Feast Object definitions with separate `feature_store.yaml` files + +This approach is very similar to the previous approach, but instead of having feast objects duplicated and having to copy over changes, it may be possible to share the same Feast object definitions and have different `feature_store.yaml` configuration. 
+ +An example of how such a repository would be structured is as follows: + +``` +├── .github +│ └── workflows +│ ├── production.yml +│ └── staging.yml +├── staging +│ └── feature_store.yaml +├── production +│ └── feature_store.yaml +└── driver_repo.py +``` + +Users can then apply the definitions to each environment in this way: +```shell +feast -f staging/feature_store.yaml apply +``` + +This setup has the advantage that you can share the feature definitions entirely, which may prevent issues with copy-pasting code. + +## Summary +In summary, once you have set up a Git based repository with CI that runs `feast apply` on changes, your infrastructure (offline store, online store, and cloud environment) will automatically be updated to support the loading of data into the feature store or retrieval of data. diff --git a/docs/project/development-guide.md b/docs/project/development-guide.md index 58e29a5ca7..5aae0628f6 100644 --- a/docs/project/development-guide.md +++ b/docs/project/development-guide.md @@ -8,6 +8,7 @@ This guide is targeted at developers looking to contribute to Feast: * [Making a Pull Request](development-guide.md#making-a-pull-request) * [Feast Data Storage Format](development-guide.md#feast-data-storage-format) * [Feast Protobuf API](development-guide.md#feast-protobuf-api) +* [Maintainer Guide](./maintainers.md) > Learn How the Feast [Contributing Process](contributing.md) works. diff --git a/docs/project/maintainers.md b/docs/project/maintainers.md new file mode 100644 index 0000000000..ff77c3dfc4 --- /dev/null +++ b/docs/project/maintainers.md @@ -0,0 +1,59 @@ +# Setting up your environment +> Please see the [Development Guide](https://docs.feast.dev/project/development-guide) for project level development instructions and [Contributing Guide](https://github.com/feast-dev/feast/blob/master/CONTRIBUTING.md) for specific details on how to set up your development environment and contribute to Feast.
+ +# Maintainers Development +> In most scenarios, your code changes or the areas of Feast that you are actively maintaining will only touch parts of the code (e.g. one offline store/online store). + +## Forked Repo Best Practices +1. You should set up your fork so that you can make pull requests against your own master branch. + - This prevents unnecessary integration tests and other GitHub Actions that are irrelevant to your code changes from being run every time you would like to make a code change. + - **NOTE**: Most workflows are enabled by default so manually [disable workflows](https://docs.github.com/en/actions/managing-workflow-runs/disabling-and-enabling-a-workflow) that are not needed. +2. When you are ready to merge changes into the official feast branch, make a pull request with the main feast branch and request a review from other maintainers. + - Since your code changes should only touch tests that are relevant to your functionality, all other tests should pass as well. + +**NOTE**: Remember to frequently sync your fork master branch with `feast-dev/feast:master`. + +## Github Actions Workflow on Fork +- **Recommended**: The github actions workflows that should be enabled on the fork are as follows: + - `unit-tests` + - Runs all of the unit tests that should always pass. + - `linter` + - Lints your PR for styling or complexity issues using mypy, isort, and flake8. + - `fork-pr-integration-tests-[provider]` + - Run all of the integration tests to test Feast functionality on your fork for a specific provider. + - The `.github/workflows` folder has examples of common workflows (`aws`, `gcp`, and `snowflake`). + 1. Move the `fork_pr_integration_tests_[provider].yml` from `.github/fork_workflows` to `.github/workflows`. + 2. Edit `fork_pr_integration_tests_[provider].yml` (more details below) to only run the integration tests that are relevant to your area of interest. + 3. 
Push the workflow to your branch and it should automatically be added to the actions on your fork. + - `build_wheels` + - Release verification workflow to use for [release](docs/project/release-process.md). + +## Integration Test Workflow Changes +Fork-specific integration tests are run by the `fork_pr_integration_tests_[provider].yml` yaml workflow files. + +1. Under the `integration-test-python` job, replace `your github repo` with your feast github repo name. +2. If your offline store/online store needs special setup, add it to the job similar to how gcp is set up. + + ```yaml + - name: Set up gcloud SDK + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + export_default_credentials: true + ``` + +3. Add any environment variables that you need to your github [secrets](https://github.com/Azure/actions-workflow-samples/blob/master/assets/create-secrets-for-GitHub-workflows.md). + - For specific github secrets that you will need to test the already supported datastores (e.g. AWS, Bigquery, Snowflake, etc.) refer to this [guide](https://github.com/feast-dev/feast/blob/master/CONTRIBUTING.md) under the `Integration Tests` section. + - Access these by setting environment variables as `secrets.SECRET_NAME`. +4. To limit pytest in your github workflow to test only your specific tests, leverage the `-k` option for pytest. + + ```bash + pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 --timeout_method=thread -k "BigQuery and not dynamo and not Redshift" + ``` + + - Each test in Feast is parametrized by its offline and online store so we can filter out tests by name. The above command chooses only tests with BigQuery that do not use Dynamo or Redshift. + +5. Every time a pull request or a change to a pull request is made, the integration tests, the local integration tests, the unit tests, and the linter should run. 
+ +> Sample fork setups can be found here: [snowflake](https://github.com/kevjumba/feast/pull/30) and [bigquery](https://github.com/kevjumba/feast/pull/31). diff --git a/docs/project/release-process.md b/docs/project/release-process.md index e9f3295d91..2ddc697730 100644 --- a/docs/project/release-process.md +++ b/docs/project/release-process.md @@ -4,49 +4,93 @@ For Feast maintainers, these are the concrete steps for making a new release. -### Pre-release Verification (Verification that wheels are built correctly) for minor release. +### 1. (for patch releases) Cherry-pick changes into the branch from master +If you were cutting Feast 0.22.3, for example, you might do: +1. `git checkout v0.22-branch` (or `git pull upstream v0.22-branch --rebase` if you've cut a release before) +2. `git cherry-pick [COMMIT FROM MASTER]` +3. `git push upstream v0.22-branch` to commit changes to the release branch + +> Note: if you're handling a maintenance release (i.e. an older version), semantic release may complain at you. See +> [Sample PR](https://github.com/feast-dev/feast/commit/40f2a6e13dd7d2a5ca5bff1af378e8712621d4f2) to enable an older +> branch to cut releases. + +After this step, you will have all the changes you need in the branch. + +### 2. Pre-release verification +A lot of things can go wrong. One of the most common is getting the wheels to build correctly (and not accidentally +building dev wheels from improper tagging or local code changes during the release process). + +We verify the wheels building in **your fork** of Feast, not the main feast-dev/feast repo. + +#### For minor releases (e.g. v0.22.0) 1. Merge upstream master changes into your **fork**. Make sure you are running the workflow off of your fork! 2. Create a tag manually for the release on your fork. For example, if you are doing a release for version 0.22.0, create a tag by doing the following. - Checkout master branch and run `git tag v0.22.0`. 
- Run `git push --tags` to push the tag to your forks master branch. -3. Access the `Actions` tab on your github UI on your fork and click the `build_wheels` action. This workflow will build the python sdk wheels for Python 3.8-3.10 on MacOS 10.15 and Linux and verify that these wheels are correct. The publish workflow uses this action to publish the python wheels for a new release to pypi. + > This is important. If you don't have a tag, then the wheels you build will be **dev wheels**, which we can't + > push. The release process will automatically produce a tag for you via Semantic Release. +3. Access the `Actions` tab on your GitHub UI on your fork and click the `build_wheels` action. This workflow will + build the python sdk wheels for Python 3.8-3.10 on MacOS 10.15 and Linux and verify that these wheels are correct. + The publish workflow uses this action to publish the python wheels for a new release to PyPI. 4. Look for the header `This workflow has a workflow_dispatch event trigger` and click `Run Workflow` on the right. -5. Run the workflow off of the tag you just created(`v0.22.0` in this case) and verify that the workflow worked (i.e ensure that all jobs are green). +5. Run the workflow off of the tag you just created(`v0.22.0` in this case, **not** the master branch) and verify that + the workflow worked (i.e ensure that all jobs are green). -### Pre-release Verification (Verification that wheels are built correctly) for patch release. -1. Check out the branch of your release (e.g `v0.22-branch` on your local **fork**) and push this to your fork (`git push -u origin `). -2. Cherry pick commits that are relevant to the patch release onto your forked branch. -3. Checkout the release branch and add a patch release tag (e.g `v0.22.1`) by running `git tag `. -4. Push tags to your origin branch with `git push origin `. -5. Kick off `build_wheels` workflow in the same way as is detailed in the last section on of the patch release tag. 
+#### For patch releases (e.g. v0.22.3) +You should already have checked out the existing minor release branch from step 1 (e.g. `v0.22-branch`). +1. Push the minor release branch to your fork (`git push -u origin `). +2. Add a patch release tag (e.g `v0.22.1`) by running `git tag `. + > This is important. If you don't have a tag, then the wheels you build will be **dev wheels**, which we can't + > push. The release process will automatically produce a tag for you via Semantic Release. +3. Push tags to your **origin branch** (not the upstream feast-dev/feast branch) with `git push origin `. +4. Kick off `build_wheels` workflow in your fork in the same way as is detailed in the last section, running the + workflow from this tag you just pushed up. -### Release for Python and Java SDK +### 3. Release for Python and Java SDK 1. Generate a [Personal Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) or retrieve your saved personal access token. - - The personal access token should have all of the permissions under the `repo` checkbox. + * The personal access token should have all of the permissions under the `repo` checkbox. 2. Access the `Actions` tab on the main `feast-dev/feast` repo and find the `release` action. 3. Look for the header `This workflow has a workflow_dispatch event trigger` again and click `Run Workflow` on the right. + * If you are making a minor or major release, you should run it off of the master branch. + * If you are making a patch release, run it off of the corresponding minor release branch. 4. Try the dry run first with your personal access token. If this succeeds, uncheck `Dry Run` and run the release workflow. -5. All of the jobs should succeed besides the UI job which needs to be released separately. Ping a maintainer on Slack to run the UI release manually. -6. 
Try to install the feast release in your local environment and test out the `feast init` -> `feast apply` workflow to verify as a sanity check that the release worked correctly. +5. Then try running normally (without dry run). + - First, the `release` workflow will kick off. This publishes an NPM package for the Web UI ([NPM package](http://npmjs.com/package/@feast-dev/feast-ui)), + bumps file versions (e.g. helm chart, UI, Java pom.xml files), and generates a changelog using Semantic Release. + All jobs should succeed. + - Second, the `publish` workflow will kick off. This builds all the Python wheels ([PyPI link](https://pypi.org/project/feast/)), + publishes helm charts, publishes the Python and Java feature servers to Docker ([DockerHub images](https://hub.docker.com/u/feastdev)), + publishes the Java Serving Client + Datatypes libraries to Maven ([Maven repo](https://mvnrepository.com/artifact/dev.feast)). +6. Try to install the Feast Python release in your local environment and test out the `feast init` -> `feast apply` + workflow to verify as a sanity check that the release worked correctly. +7. Verify the releases all show the new version: + - [NPM package](http://npmjs.com/package/@feast-dev/feast-ui) + - [PyPI link](https://pypi.org/project/feast/) + - [DockerHub images (Java + Python feature servers, feature transformation server)](https://hub.docker.com/u/feastdev) + - [Maven repo (feast-datatypes, feast-serving-client)](https://mvnrepository.com/artifact/dev.feast) + +### 4. (for minor releases) Post-release steps +#### 4a: Creating a new branch +Create a new branch based on master (i.e. v0.22-branch) and push to the main Feast repo. This will be where +cherry-picks go for future patch releases and where documentation will point. -### (for minor releases) Post-release steps -1. Create a new branch based on master (i.e. v0.22-branch) and push to the main Feast repo. 
This will be where cherry-picks go for future patch releases and where documentation will point. -2. Write a summary of the release in the GitHub release - 1. By default, Semantic Release will pull in messages from commits (features vs fixes, etc). But this is hard to digest still, so it helps to have a high level overview. +#### 4b: Adding a high level summary in the GitHub release notes +By default, Semantic Release will pull in messages from commits (features vs fixes, etc). But this is hard to digest, +so it helps to have a high level overview. See https://github.com/feast-dev/feast/releases for the releases. -### Update documentation +#### 4c: Update documentation -In the Feast Gitbook (ask [Danny Chiao](https://tectonfeast.slack.com/team/U029405HFEU) in Slack for access): +In the Feast Gitbook (ask [Danny Chiao](https://tectonfeast.slack.com/team/U029405HFEU) in Slack for access): 1. Create a new space within the Feast collection -2. Go to the overflow menu on the top -> Synchronize with Git +2. Go to the overflow menu on the top -> Synchronize with Git 1. Specify GitHub as the provider - + ![](new_branch_part_1.png) 2. Configure to point to the new release branch ![](new_branch_part_2.png) 3. Publish the new page for this branch as part of the collection - + ![](new_branch_part_3.png) 4. Go back to the main Feast collection and go to the overflow menu -> "Customize collection" @@ -54,4 +98,5 @@ In the Feast Gitbook (ask [Danny Chiao](https://tectonfeast.slack.com/team/U0294 5. Configure the default space to be your new branch and save ![](new_branch_part_5.png) -6. Verify on docs.feast.dev that this new space is the default (this may take a few minutes to propagate, and your browser cache may be caching the old branch as the default) \ No newline at end of file +6. 
Verify on [docs.feast.dev](http://docs.feast.dev) that this new space is the default (this may take a few minutes to + propagate, and your browser cache may be caching the old branch as the default) \ No newline at end of file diff --git a/docs/reference/alpha-on-demand-feature-view.md b/docs/reference/alpha-on-demand-feature-view.md index eb8c4f6291..01b47d13dc 100644 --- a/docs/reference/alpha-on-demand-feature-view.md +++ b/docs/reference/alpha-on-demand-feature-view.md @@ -1,23 +1,30 @@ -# \[Alpha\] On demand feature view +# \[Alpha] On demand feature view **Warning**: This is an _experimental_ feature. It's intended for early testing and feedback, and could change without warnings in future releases. -{% hint style="info" %} -To enable this feature, run **`feast alpha enable on_demand_transforms`** -{% endhint %} - ## Overview -On demand feature views allows users to use existing features and request time data \(features only available at request time\) to transform and create new features. Users define python transformation logic which is executed in both historical retrieval and online retrieval paths. +On demand feature views allows data scientists to use existing features and request time data (features only available at request time) to transform and create new features. Users define python transformation logic which is executed in both historical retrieval and online retrieval paths. + +Currently, these transformations are executed locally. This is fine for online serving, but does not scale well offline. -Currently, these transformations are executed locally. Future milestones include building a Feature Transformation Server for executing transformations at higher scale. +### Why use on demand feature views? + +This enables data scientists to easily impact the online feature retrieval path. For example, a data scientist could + +1. Call `get_historical_features` to generate a training dataframe +2. 
Iterate in notebook on feature engineering in Pandas +3. Copy transformation logic into on demand feature views and commit to a dev branch of the feature repository +4. Verify with `get_historical_features` (on a small dataset) that the transformation gives expected output over historical data +5. Verify with `get_online_features` on dev branch that the transformation correctly outputs online features +6. Submit a pull request to the staging / prod branches which impact production traffic ## CLI There are new CLI commands: * `feast on-demand-feature-views list` lists all registered on demand feature view after `feast apply` is run -* `feast on-demand-feature-views describe [NAME]` describes the definition of an on demand feature view +* `feast on-demand-feature-views describe [NAME]` describes the definition of an on demand feature view ## Example @@ -63,7 +70,7 @@ def transformed_conv_rate(features_df: pd.DataFrame) -> pd.DataFrame: ### **Feature retrieval** {% hint style="info" %} -The on demand feature view's name is the function name \(i.e. `transformed_conv_rate`\). +The on demand feature view's name is the function name (i.e. `transformed_conv_rate`). {% endhint %} And then to retrieve historical or online features, we can call this in a feature service or reference individual features: @@ -80,4 +87,3 @@ training_df = store.get_historical_features( ], ).to_df() ``` - diff --git a/docs/reference/alpha-web-ui.md b/docs/reference/alpha-web-ui.md index 182f9fb13d..7d21a3d45d 100644 --- a/docs/reference/alpha-web-ui.md +++ b/docs/reference/alpha-web-ui.md @@ -1,14 +1,15 @@ -# \[Alpha\] Feast Web UI +# \[Beta] Web UI -**Warning**: This is an _experimental_ feature. It's intended for early testing and feedback, and could change without warnings in future releases. +**Warning**: This is an _experimental_ feature. To our knowledge, this is stable, but there are still rough edges in the experience. Contributions are welcome! 
## Overview The Feast Web UI allows users to explore their feature repository through a Web UI. It includes functionality such as: -- Browsing Feast objects (feature views, entities, data sources, feature services, and saved datasets) and their relationships -- Searching and filtering for Feast objects by tags -![Sample UI](ui.png) +* Browsing Feast objects (feature views, entities, data sources, feature services, and saved datasets) and their relationships +* Searching and filtering for Feast objects by tags + +![Sample UI](../../ui/sample.png) ## Usage @@ -19,6 +20,7 @@ There are several ways to use the Feast Web UI. The easiest way to get started is to run the `feast ui` command within a feature repository: Output of `feast ui --help`: + ```bash Usage: feast ui [OPTIONS] @@ -84,6 +86,7 @@ When you start the React app, it will look for `project-list.json` to find a lis ``` Then start the React App + ```bash yarn start ``` @@ -92,7 +95,7 @@ yarn start The advantage of importing Feast UI as a module is in the ease of customization. The `` component exposes a `feastUIConfigs` prop thorough which you can customize the UI. Currently it supports a few parameters. -##### Fetching the Project List +**Fetching the Project List** You can use `projectListPromise` to provide a promise that overrides where the Feast UI fetches the project list from. @@ -110,7 +113,7 @@ You can use `projectListPromise` to provide a promise that overrides where the F /> ``` -##### Custom Tabs +**Custom Tabs** You can add custom tabs for any of the core Feast objects through the `tabsRegistry`. @@ -133,4 +136,3 @@ const tabsRegistry = { ``` Examples of custom tabs can be found in the `ui/custom-tabs` folder. 
- diff --git a/docs/reference/batch-materialization/README.md b/docs/reference/batch-materialization/README.md new file mode 100644 index 0000000000..50640bce49 --- /dev/null +++ b/docs/reference/batch-materialization/README.md @@ -0,0 +1,7 @@ +# Batch materialization + +Please see [Batch Materialization Engine](../../getting-started/architecture-and-components/batch-materialization-engine.md) for an explanation of batch materialization engines. + +{% page-ref page="snowflake.md" %} + +{% page-ref page="bytewax.md" %} diff --git a/docs/reference/batch-materialization/bytewax.md b/docs/reference/batch-materialization/bytewax.md new file mode 100644 index 0000000000..bd98a4dc6e --- /dev/null +++ b/docs/reference/batch-materialization/bytewax.md @@ -0,0 +1,74 @@ +# Bytewax + +## Description + +The [Bytewax](https://bytewax.io) batch materialization engine provides an execution +engine for batch materializing operations (`materialize` and `materialize-incremental`). + +### Guide + +In order to use the Bytewax materialization engine, you will need a [Kubernetes](https://kubernetes.io/) cluster running version 1.22.10 or greater. + +#### Kubernetes Authentication + +The Bytewax materialization engine loads authentication and cluster information from the [kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). By default, kubectl looks for a file named `config` in the `$HOME/.kube directory`. You can specify other kubeconfig files by setting the `KUBECONFIG` environment variable. + +#### Resource Authentication + +Bytewax jobs can be configured to access [Kubernetes secrets](https://kubernetes.io/docs/concepts/configuration/secret/) as environment variables to access online and offline stores during job runs. 
+ +To configure secrets, first create them using `kubectl`: + +``` shell +kubectl create secret generic -n bytewax aws-credentials --from-literal=aws-access-key-id='' --from-literal=aws-secret-access-key='' +``` + +Then configure them in the `batch_engine` section of `feature_store.yaml`: + +``` yaml +batch_engine: + type: bytewax + namespace: bytewax + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: aws-access-key-id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: aws-secret-access-key +``` + +#### Configuration + +The Bytewax materialization engine is configured through the `feature_store.yaml` configuration file: + +``` yaml +batch_engine: + type: bytewax + namespace: bytewax + image: bytewax/bytewax-feast:latest +``` + +The `namespace` configuration directive specifies which Kubernetes [namespace](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/) jobs, services and configuration maps will be created in. + +#### Building a custom Bytewax Docker image + +The `image` configuration directive specifies which container image to use when running the materialization job. To create a custom image based on this container, run the following command: + +``` shell +DOCKER_BUILDKIT=1 docker build . 
-f ./sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile -t +``` + +Once that image is built and pushed to a registry, it can be specified as a part of the batch engine configuration: + +``` yaml +batch_engine: + type: bytewax + namespace: bytewax + image: +``` + diff --git a/docs/reference/batch-materialization/snowflake.md b/docs/reference/batch-materialization/snowflake.md new file mode 100644 index 0000000000..c2fa441d6d --- /dev/null +++ b/docs/reference/batch-materialization/snowflake.md @@ -0,0 +1,28 @@ +# Snowflake + +## Description + +The [Snowflake](https://trial.snowflake.com) batch materialization engine provides a highly scalable and parallel execution engine using a Snowflake Warehouse for batch materialization operations (`materialize` and `materialize-incremental`) when using a `SnowflakeSource`. + +The engine requires no additional configuration other than for you to supply Snowflake's standard login and context details. The engine leverages custom (automatically deployed for you) Python UDFs to do the proper serialization of your offline store data to your online serving tables. + +When using all three options together, `snowflake.offline`, `snowflake.engine`, and `snowflake.online`, you get the most unique experience of unlimited scale and performance + governance and data security. + +## Example + +{% code title="feature_store.yaml" %} +```yaml +... +offline_store: + type: snowflake.offline +... +batch_engine: + type: snowflake.engine + account: snowflake_deployment.us-east-1 + user: user_login + password: user_password + role: sysadmin + warehouse: demo_wh + database: FEAST +``` +{% endcode %} diff --git a/docs/reference/codebase-structure.md b/docs/reference/codebase-structure.md new file mode 100644 index 0000000000..b75227860b --- /dev/null +++ b/docs/reference/codebase-structure.md @@ -0,0 +1,131 @@ +# Codebase structure + +Let's examine the Feast codebase. +This analysis is accurate as of Feast 0.23. 
+ +``` +$ tree -L 1 -d +. +├── docs +├── examples +├── go +├── infra +├── java +├── protos +├── sdk +└── ui +``` + +## Python SDK + +The Python SDK lives in `sdk/python/feast`. +The majority of Feast logic lives in these Python files: +* The core Feast objects ([entities](../getting-started/concepts/entity.md), [feature views](../getting-started/concepts/feature-view.md), [data sources](../getting-started/concepts/dataset.md), etc.) are defined in their respective Python files, such as `entity.py`, `feature_view.py`, and `data_source.py`. +* The `FeatureStore` class is defined in `feature_store.py` and the associated configuration object (the Python representation of the `feature_store.yaml` file) are defined in `repo_config.py`. +* The CLI and other core feature store logic are defined in `cli.py` and `repo_operations.py`. +* The type system that is used to manage conversion between Feast types and external typing systems is managed in `type_map.py`. +* The Python feature server (the server that is started through the `feast serve` command) is defined in `feature_server.py`. + +There are also several important submodules: +* `infra/` contains all the infrastructure components, such as the provider, offline store, online store, batch materialization engine, and registry. +* `dqm/` covers data quality monitoring, such as the dataset profiler. +* `diff/` covers the logic for determining how to apply infrastructure changes upon feature repo changes (e.g. the output of `feast plan` and `feast apply`). +* `embedded_go/` covers the Go feature server. +* `ui/` contains the embedded Web UI, to be launched on the `feast ui` command. + +Of these submodules, `infra/` is the most important. 
+It contains the interfaces for the [provider](getting-started/architecture-and-components/provider.md), [offline store](getting-started/architecture-and-components/offline-store.md), [online store](getting-started/architecture-and-components/online-store.md), [batch materialization engine](getting-started/architecture-and-components/batch-materialization-engine.md), and [registry](getting-started/architecture-and-components/registry.md), as well as all of their individual implementations. + +``` +$ tree --dirsfirst -L 1 infra +infra +├── contrib +├── feature_servers +├── materialization +├── offline_stores +├── online_stores +├── registry +├── transformation_servers +├── utils +├── __init__.py +├── aws.py +├── gcp.py +├── infra_object.py +├── key_encoding_utils.py +├── local.py +├── passthrough_provider.py +└── provider.py +``` + +The tests for the Python SDK are contained in `sdk/python/tests`. +For more details, see this [overview](../how-to-guides/adding-or-reusing-tests.md#test-suite-overview) of the test suite. + +### Example flow: `feast apply` + +Let's walk through how `feast apply` works by tracking its execution across the codebase. + +1. All CLI commands are in `cli.py`. + Most of these commands are backed by methods in `repo_operations.py`. + The `feast apply` command triggers `apply_total_command`, which then calls `apply_total` in `repo_operations.py`. +2. With a `FeatureStore` object (from `feature_store.py`) that is initialized based on the `feature_store.yaml` in the current working directory, `apply_total` first parses the feature repo with `parse_repo` and then calls either `FeatureStore.apply` or `FeatureStore._apply_diffs` to apply those changes to the feature store. +3. Let's examine `FeatureStore.apply`. + It splits the objects based on class (e.g. `Entity`, `FeatureView`, etc.) and then calls the appropriate registry method to apply or delete the object. + For example, it might call `self._registry.apply_entity` to apply an entity. 
+ If the default file-based registry is used, this logic can be found in `infra/registry/registry.py`. +4. Then the feature store must update its cloud infrastructure (e.g. online store tables) to match the new feature repo, so it calls `Provider.update_infra`, which can be found in `infra/provider.py`. +5. Assuming the provider is a built-in provider (e.g. one of the local, GCP, or AWS providers), it will call `PassthroughProvider.update_infra` in `infra/passthrough_provider.py`. +6. This delegates to the online store and batch materialization engine. + For example, if the feature store is configured to use the Redis online store then the `update` method from `infra/online_stores/redis.py` will be called. + And if the local materialization engine is configured then the `update` method from `infra/materialization/local_engine.py` will be called. + +At this point, the `feast apply` command is complete. + +### Example flow: `feast materialize` + +Let's walk through how `feast materialize` works by tracking its execution across the codebase. + +1. The `feast materialize` command triggers `materialize_command` in `cli.py`, which then calls `FeatureStore.materialize` from `feature_store.py`. +2. This then calls `Provider.materialize_single_feature_view`, which can be found in `infra/provider.py`. +3. As with `feast apply`, the provider is most likely backed by the passthrough provider, in which case `PassthroughProvider.materialize_single_feature_view` will be called. +4. This delegates to the underlying batch materialization engine. + Assuming that the local engine has been configured, `LocalMaterializationEngine.materialize` from `infra/materialization/local_engine.py` will be called. +5. Since materialization involves reading features from the offline store and writing them to the online store, the local engine will delegate to both the offline store and online store. 
+ Specifically, it will call `OfflineStore.pull_latest_from_table_or_query` and `OnlineStore.online_write_batch`. + These two calls will be routed to the offline store and online store that have been configured. + +### Example flow: `get_historical_features` + +Let's walk through how `get_historical_features` works by tracking its execution across the codebase. + +1. We start with `FeatureStore.get_historical_features` in `feature_store.py`. + This method does some internal preparation, and then delegates the actual execution to the underlying provider by calling `Provider.get_historical_features`, which can be found in `infra/provider.py`. +2. As with `feast apply`, the provider is most likely backed by the passthrough provider, in which case `PassthroughProvider.get_historical_features` will be called. +3. That call simply delegates to `OfflineStore.get_historical_features`. + So if the feature store is configured to use Snowflake as the offline store, `SnowflakeOfflineStore.get_historical_features` will be executed. + +## Java SDK + +The `java/` directory contains the Java serving component. +See [here](https://github.com/feast-dev/feast/blob/master/java/CONTRIBUTING.md) for more details on how the repo is structured. + +## Go feature server + +The `go/` directory contains the Go feature server. +Most of the files here have logic to help with reading features from the online store. +Within `go/`, the `internal/feast/` directory contains most of the core logic: +* `onlineserving/` covers the core serving logic. +* `model/` contains the implementations of the Feast objects (entity, feature view, etc.). + * For example, `entity.go` is the Go equivalent of `entity.py`. It contains a very simple Go implementation of the entity object. +* `registry/` covers the registry. + * Currently only the file-based registry is supported (the sql-based registry is unsupported). 
Additionally, the file-based registry only supports a file-based registry store, not the GCS or S3 registry stores. +* `onlinestore/` covers the online stores (currently only Redis and SQLite are supported). + +## Protobufs + +Feast uses [protobuf](https://github.com/protocolbuffers/protobuf) to store serialized versions of the core Feast objects. +The protobuf definitions are stored in `protos/feast`. + +## Web UI + +The `ui/` directory contains the Web UI. +See [here](https://github.com/feast-dev/feast/blob/master/ui/CONTRIBUTING.md) for more details on the structure of the Web UI. diff --git a/docs/reference/data-sources/README.md b/docs/reference/data-sources/README.md index b4fbc98b46..e69fbab8e3 100644 --- a/docs/reference/data-sources/README.md +++ b/docs/reference/data-sources/README.md @@ -1,6 +1,10 @@ # Data sources -Please see [Data Source](../../getting-started/concepts/feature-view.md#data-source) for an explanation of data sources. +Please see [Data Source](../../getting-started/concepts/data-ingestion.md) for a conceptual explanation of data sources. 
+ +{% content-ref url="overview.md" %} +[overview.md](overview.md) +{% endcontent-ref %} {% content-ref url="file.md" %} [file.md](file.md) @@ -35,5 +39,13 @@ Please see [Data Source](../../getting-started/concepts/feature-view.md#data-sou {% endcontent-ref %} {% content-ref url="postgres.md" %} -[postgres.md]([postgres].md) +[postgres.md](postgres.md) +{% endcontent-ref %} + +{% content-ref url="trino.md" %} +[trino.md](trino.md) +{% endcontent-ref %} + +{% content-ref url="mssql.md" %} +[mssql.md](mssql.md) {% endcontent-ref %} diff --git a/docs/reference/data-sources/bigquery.md b/docs/reference/data-sources/bigquery.md index 47eb9b1bf6..51c9b19ecd 100644 --- a/docs/reference/data-sources/bigquery.md +++ b/docs/reference/data-sources/bigquery.md @@ -1,15 +1,14 @@ -# BigQuery +# BigQuery source ## Description -BigQuery data sources allow for the retrieval of historical feature values from BigQuery for building training datasets as well as materializing features into an online store. - -* Either a table reference or a SQL query can be provided. -* No performance guarantees can be provided over SQL query-based sources. Please use table references where possible. +BigQuery data sources are BigQuery tables or views. +These can be specified either by a table reference or a SQL query. +However, no performance guarantees can be provided for SQL query-based sources, so table references are recommended. ## Examples -Using a table reference +Using a table reference: ```python from feast import BigQuerySource @@ -19,7 +18,7 @@ my_bigquery_source = BigQuerySource( ) ``` -Using a query +Using a query: ```python from feast import BigQuerySource @@ -30,5 +29,9 @@ BigQuerySource( ) ``` -Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.BigQuerySource). +The full set of configuration options is available [here](https://rtd.feast.dev/en/latest/index.html#feast.infra.offline_stores.bigquery_source.BigQuerySource). 
+ +## Supported Types +BigQuery data sources support all eight primitive types and their corresponding array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/file.md b/docs/reference/data-sources/file.md index 12e6529840..5895b1a8ce 100644 --- a/docs/reference/data-sources/file.md +++ b/docs/reference/data-sources/file.md @@ -1,8 +1,9 @@ -# File +# File source ## Description -File data sources allow for the retrieval of historical feature values from files on disk for building training datasets, as well as for materializing features into an online store. +File data sources are files on disk or on S3. +Currently only Parquet files are supported. {% hint style="warning" %} FileSource is meant for development purposes only and is not optimized for production use. @@ -20,5 +21,9 @@ parquet_file_source = FileSource( ) ``` -Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.FileSource). +The full set of configuration options is available [here](https://rtd.feast.dev/en/latest/index.html#feast.infra.offline_stores.file_source.FileSource). +## Supported Types + +File data sources support all eight primitive types and their corresponding array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/mssql.md b/docs/reference/data-sources/mssql.md new file mode 100644 index 0000000000..8bf1ede6aa --- /dev/null +++ b/docs/reference/data-sources/mssql.md @@ -0,0 +1,29 @@ +# MsSQL source (contrib) + +## Description + +MsSQL data sources are Microsoft sql table sources. +These can be specified either by a table reference or a SQL query. + +## Disclaimer + +The MsSQL data source does not achieve full test coverage. +Please do not assume complete stability. 
+ +## Examples + +Defining a MsSQL source: + +```python +from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import ( + MsSqlServerSource, +) + +driver_hourly_table = "driver_hourly" + +driver_source = MsSqlServerSource( + table_ref=driver_hourly_table, + event_timestamp_column="datetime", + created_timestamp_column="created", +) +``` diff --git a/docs/reference/data-sources/overview.md b/docs/reference/data-sources/overview.md new file mode 100644 index 0000000000..112d4168d3 --- /dev/null +++ b/docs/reference/data-sources/overview.md @@ -0,0 +1,31 @@ +# Overview + +## Functionality + +In Feast, each batch data source is associated with a corresponding offline store. +For example, a `SnowflakeSource` can only be processed by the Snowflake offline store. +Otherwise, the primary difference between batch data sources is the set of supported types. +Feast has an internal type system, and aims to support eight primitive types (`bytes`, `string`, `int32`, `int64`, `float32`, `float64`, `bool`, and `timestamp`) along with the corresponding array types. +However, not every batch data source supports all of these types. + +For more details on the Feast type system, see [here](../type-system.md). + +## Functionality Matrix + +There are currently four core batch data source implementations: `FileSource`, `BigQuerySource`, `SnowflakeSource`, and `RedshiftSource`. +There are several additional implementations contributed by the Feast community (`PostgreSQLSource`, `SparkSource`, and `TrinoSource`), which are not guaranteed to be stable or to match the functionality of the core implementations. +Details for each specific data source can be found [here](README.md). + +Below is a matrix indicating which data sources support which types. 
+ +| | File | BigQuery | Snowflake | Redshift | Postgres | Spark | Trino | +| :-------------------------------- | :-- | :-- | :-- | :-- | :-- | :-- | :-- | +| `bytes` | yes | yes | yes | yes | yes | yes | yes | +| `string` | yes | yes | yes | yes | yes | yes | yes | +| `int32` | yes | yes | yes | yes | yes | yes | yes | +| `int64` | yes | yes | yes | yes | yes | yes | yes | +| `float32` | yes | yes | yes | yes | yes | yes | yes | +| `float64` | yes | yes | yes | yes | yes | yes | yes | +| `bool` | yes | yes | yes | yes | yes | yes | yes | +| `timestamp` | yes | yes | yes | yes | yes | yes | yes | +| array types | yes | yes | no | no | yes | yes | no | \ No newline at end of file diff --git a/docs/reference/data-sources/postgres.md b/docs/reference/data-sources/postgres.md index 759cb50bbd..23d7818a04 100644 --- a/docs/reference/data-sources/postgres.md +++ b/docs/reference/data-sources/postgres.md @@ -1,15 +1,18 @@ -# PostgreSQL +# PostgreSQL source (contrib) ## Description -**NOTE**: The Postgres plugin is a contrib plugin. This means it may not be fully stable. +PostgreSQL data sources are PostgreSQL tables or views. +These can be specified either by a table reference or a SQL query. +## Disclaimer -The PostgreSQL data source allows for the retrieval of historical feature values from a PostgreSQL database for building training datasets as well as materializing features into an online store. +The PostgreSQL data source does not achieve full test coverage. +Please do not assume complete stability. ## Examples -Defining a Postgres source +Defining a Postgres source: ```python from feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source import ( @@ -23,3 +26,10 @@ driver_stats_source = PostgreSQLSource( created_timestamp_column="created", ) ``` + +The full set of configuration options is available [here](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source.PostgreSQLSource). 
+ +## Supported Types + +PostgreSQL data sources support all eight primitive types and their corresponding array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/push.md b/docs/reference/data-sources/push.md index 6af070d1c4..035ee58360 100644 --- a/docs/reference/data-sources/push.md +++ b/docs/reference/data-sources/push.md @@ -27,7 +27,7 @@ Feast allows users to push features previously registered in a feature view to t Note that the push schema needs to also include the entity. ```python -from feast import PushSource, ValueType, BigQuerySource, FeatureView, Feature, Field +from feast import Entity, PushSource, ValueType, BigQuerySource, FeatureView, Feature, Field from feast.types import Int64 push_source = PushSource( @@ -35,9 +35,11 @@ push_source = PushSource( batch_source=BigQuerySource(table="test.test"), ) +user = Entity(name="user", join_keys=["user_id"]) + fv = FeatureView( name="feature view", - entities=["user_id"], + entities=[user], schema=[Field(name="life_time_value", dtype=Int64)], source=push_source, ) diff --git a/docs/reference/data-sources/redshift.md b/docs/reference/data-sources/redshift.md index 7f50c64d02..2c3c65cc70 100644 --- a/docs/reference/data-sources/redshift.md +++ b/docs/reference/data-sources/redshift.md @@ -1,15 +1,14 @@ -# Redshift +# Redshift source ## Description -Redshift data sources allow for the retrieval of historical feature values from Redshift for building training datasets as well as materializing features into an online store. - -* Either a table name or a SQL query can be provided. -* No performance guarantees can be provided over SQL query-based sources. Please use table references where possible. +Redshift data sources are Redshift tables or views. +These can be specified either by a table reference or a SQL query. 
+However, no performance guarantees can be provided for SQL query-based sources, so table references are recommended. ## Examples -Using a table name +Using a table name: ```python from feast import RedshiftSource @@ -19,7 +18,7 @@ my_redshift_source = RedshiftSource( ) ``` -Using a query +Using a query: ```python from feast import RedshiftSource @@ -30,5 +29,9 @@ my_redshift_source = RedshiftSource( ) ``` -Configuration options are available [here](https://rtd.feast.dev/en/master/feast.html?#feast.RedshiftSource). +The full set of configuration options is available [here](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.redshift_source.RedshiftSource). + +## Supported Types +Redshift data sources support all eight primitive types, but currently do not support array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/snowflake.md b/docs/reference/data-sources/snowflake.md index 0f5304b6cd..82bf5cb4d4 100644 --- a/docs/reference/data-sources/snowflake.md +++ b/docs/reference/data-sources/snowflake.md @@ -1,14 +1,13 @@ -# Snowflake +# Snowflake source ## Description -Snowflake data sources allow for the retrieval of historical feature values from Snowflake for building training datasets as well as materializing features into an online store. - -* Either a table reference or a SQL query can be provided. +Snowflake data sources are Snowflake tables or views. +These can be specified either by a table reference or a SQL query. ## Examples -Using a table reference +Using a table reference: ```python from feast import SnowflakeSource @@ -20,7 +19,7 @@ my_snowflake_source = SnowflakeSource( ) ``` -Using a query +Using a query: ```python from feast import SnowflakeSource @@ -38,7 +37,14 @@ my_snowflake_source = SnowflakeSource( ) ``` -One thing to remember is how Snowflake handles table and column name conventions. 
-You can read more about quote identifiers [here](https://docs.snowflake.com/en/sql-reference/identifiers-syntax.html) +{% hint style="warning" %} +Be careful about how Snowflake handles table and column name conventions. +In particular, you can read more about quote identifiers [here](https://docs.snowflake.com/en/sql-reference/identifiers-syntax.html). +{% endhint %} + +The full set of configuration options is available [here](https://rtd.feast.dev/en/latest/index.html#feast.infra.offline_stores.snowflake_source.SnowflakeSource). + +## Supported Types -Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.SnowflakeSource). +Snowflake data sources support all eight primitive types, but currently do not support array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/spark.md b/docs/reference/data-sources/spark.md index 266a401a51..99d5902667 100644 --- a/docs/reference/data-sources/spark.md +++ b/docs/reference/data-sources/spark.md @@ -1,16 +1,17 @@ -# Spark (contrib) +# Spark source (contrib) ## Description -**NOTE**: Spark data source api is currently in alpha development and the API is not completely stable. The API may change or update in the future. +Spark data sources are tables or files that can be loaded from some Spark store (e.g. Hive or in-memory). They can also be specified by a SQL query. -The spark data source API allows for the retrieval of historical feature values from file/database sources for building training datasets as well as materializing features into an online store. +## Disclaimer -* Either a table name, a SQL query, or a file path can be provided. +The Spark data source does not achieve full test coverage. +Please do not assume complete stability. 
## Examples -Using a table reference from SparkSession(for example, either in memory or a Hive Metastore) +Using a table reference from SparkSession (for example, either in-memory or a Hive Metastore): ```python from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( @@ -22,7 +23,7 @@ my_spark_source = SparkSource( ) ``` -Using a query +Using a query: ```python from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( @@ -35,7 +36,7 @@ my_spark_source = SparkSource( ) ``` -Using a file reference +Using a file reference: ```python from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( @@ -49,3 +50,10 @@ my_spark_source = SparkSource( created_timestamp_column="created", ) ``` + +The full set of configuration options is available [here](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource). + +## Supported Types + +Spark data sources support all eight primitive types and their corresponding array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/data-sources/trino.md b/docs/reference/data-sources/trino.md new file mode 100644 index 0000000000..c74981f47e --- /dev/null +++ b/docs/reference/data-sources/trino.md @@ -0,0 +1,34 @@ +# Trino source (contrib) + +## Description + +Trino data sources are Trino tables or views. +These can be specified either by a table reference or a SQL query. + +## Disclaimer + +The Trino data source does not achieve full test coverage. +Please do not assume complete stability. 
+ +## Examples + +Defining a Trino source: + +```python +from feast.infra.offline_stores.contrib.trino_offline_store.trino_source import ( + TrinoSource, +) + +driver_hourly_stats = TrinoSource( + event_timestamp_column="event_timestamp", + table_ref="feast.driver_stats", + created_timestamp_column="created", +) +``` + +The full set of configuration options is available [here](https://rtd.feast.dev/en/master/#trino-source). + +## Supported Types + +Trino data sources support all eight primitive types, but currently do not support array types. +For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/docs/reference/feature-repository.md b/docs/reference/feature-repository.md index a979034989..f9c91de350 100644 --- a/docs/reference/feature-repository.md +++ b/docs/reference/feature-repository.md @@ -89,8 +89,8 @@ A feature repository can also contain one or more Python files that contain feat ```python from datetime import timedelta -from feast import BigQuerySource, Entity, Feature, FeatureView, Field, ValueType -from feast.types import Float32, String +from feast import BigQuerySource, Entity, Feature, FeatureView, Field +from feast.types import Float32, Int64, String driver_locations_source = BigQuerySource( table="rh_prod.ride_hailing_co.drivers", @@ -100,17 +100,17 @@ driver_locations_source = BigQuerySource( driver = Entity( name="driver", - value_type=ValueType.INT64, description="driver id", ) driver_locations = FeatureView( name="driver_locations", - entities=["driver"], + entities=[driver], ttl=timedelta(days=1), schema=[ Field(name="lat", dtype=Float32), Field(name="lon", dtype=String), + Field(name="driver", dtype=Int64), ], source=driver_locations_source, ) diff --git a/docs/reference/feature-repository/README.md b/docs/reference/feature-repository/README.md index f737318773..2c1b112a78 100644 --- a/docs/reference/feature-repository/README.md +++ b/docs/reference/feature-repository/README.md @@ 
-94,8 +94,8 @@ A feature repository can also contain one or more Python files that contain feat ```python from datetime import timedelta -from feast import BigQuerySource, Entity, Feature, FeatureView, Field, ValueType -from feast.types import Float32, String +from feast import BigQuerySource, Entity, Feature, FeatureView, Field +from feast.types import Float32, Int64, String driver_locations_source = BigQuerySource( table_ref="rh_prod.ride_hailing_co.drivers", @@ -105,17 +105,17 @@ driver_locations_source = BigQuerySource( driver = Entity( name="driver", - value_type=ValueType.INT64, description="driver id", ) driver_locations = FeatureView( name="driver_locations", - entities=["driver"], + entities=[driver], ttl=timedelta(days=1), schema=[ Field(name="lat", dtype=Float32), Field(name="lon", dtype=String), + Field(name="driver", dtype=Int64), ], source=driver_locations_source, ) diff --git a/docs/reference/feature-servers/README.md b/docs/reference/feature-servers/README.md index 301cea372c..f9a40104c3 100644 --- a/docs/reference/feature-servers/README.md +++ b/docs/reference/feature-servers/README.md @@ -2,4 +2,14 @@ Feast users can choose to retrieve features from a feature server, as opposed to through the Python SDK. 
-{% page-ref page="python-feature-server.md" %} +{% content-ref url="python-feature-server.md" %} +[python-feature-server.md](python-feature-server.md) +{% endcontent-ref %} + +{% content-ref url="go-feature-server.md" %} +[go-feature-server.md](go-feature-server.md) +{% endcontent-ref %} + +{% content-ref url="alpha-aws-lambda-feature-server.md" %} +[alpha-aws-lambda-feature-server.md](alpha-aws-lambda-feature-server.md) +{% endcontent-ref %} \ No newline at end of file diff --git a/docs/reference/alpha-aws-lambda-feature-server.md b/docs/reference/feature-servers/alpha-aws-lambda-feature-server.md similarity index 66% rename from docs/reference/alpha-aws-lambda-feature-server.md rename to docs/reference/feature-servers/alpha-aws-lambda-feature-server.md index eadcf40bb4..caf5542bdc 100644 --- a/docs/reference/alpha-aws-lambda-feature-server.md +++ b/docs/reference/feature-servers/alpha-aws-lambda-feature-server.md @@ -1,20 +1,16 @@ -# \[Alpha\] AWS Lambda feature server +# \[Alpha] AWS Lambda feature server **Warning**: This is an _experimental_ feature. It's intended for early testing and feedback, and could change without warnings in future releases. -{% hint style="info" %} -To enable this feature, run **`feast alpha enable aws_lambda_feature_server`** -{% endhint %} - ## Overview -The AWS Lambda feature server is an HTTP endpoint that serves features with JSON I/O, deployed as a Docker image through AWS Lambda and AWS API Gateway. This enables users to get features from Feast using any programming language that can make HTTP requests. A [local feature server](feature-servers/python-feature-server.md) is also available. A remote feature server on GCP Cloud Run is currently being developed. +The AWS Lambda feature server is an HTTP endpoint that serves features with JSON I/O, deployed as a Docker image through AWS Lambda and AWS API Gateway. This enables users to get features from Feast using any programming language that can make HTTP requests. 
A [local feature server](python-feature-server.md) is also available. A remote feature server on GCP Cloud Run is currently being developed. ## Deployment The AWS Lambda feature server is only available to projects using the `AwsProvider` with registries on S3. It is disabled by default. To enable it, `feature_store.yaml` must be modified; specifically, the `enable` flag must be on and an `execution_role_name` must be specified. For example, after running `feast init -t aws`, changing the registry to be on S3, and enabling the feature server, the contents of `feature_store.yaml` should look similar to the following: -```text +``` project: dev registry: s3://feast/registries/dev provider: aws @@ -27,9 +23,6 @@ offline_store: database: feast s3_staging_location: s3://feast/redshift/tests/staging_location iam_role: arn:aws:iam::{aws_account}:role/redshift_s3_access_role -flags: - alpha_features: true - aws_lambda_feature_server: true feature_server: enabled: True execution_role_name: arn:aws:iam::{aws_account}:role/lambda_execution_role @@ -41,12 +34,12 @@ If enabled, the feature server will be deployed during `feast apply`. After it i Feast requires the following permissions in order to deploy and teardown AWS Lambda feature server: -| Permissions | Resources | -| ----------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | -|

lambda:CreateFunction

lambda:GetFunction

lambda:DeleteFunction

lambda:AddPermission

lambda:UpdateFunctionConfiguration

| arn:aws:lambda:\:\:function:feast-\* | -|

ecr:CreateRepository

ecr:DescribeRepositories

ecr:DeleteRepository

ecr:PutImage

ecr:DescribeImages

ecr:BatchDeleteImage

ecr:CompleteLayerUpload

ecr:UploadLayerPart

ecr:InitiateLayerUpload

ecr:BatchCheckLayerAvailability

ecr:GetDownloadUrlForLayer

ecr:GetRepositoryPolicy

ecr:SetRepositoryPolicy

ecr:GetAuthorizationToken

| \* | -|

iam:PassRole

| arn:aws:iam::\:role/ | -|

apigateway:*

|

arn:aws:apigateway:*::/apis/*/routes/*/routeresponses

arn:aws:apigateway:*::/apis/*/routes/*/routeresponses/*

arn:aws:apigateway:*::/apis/*/routes/*

arn:aws:apigateway:*::/apis/*/routes

arn:aws:apigateway:*::/apis/*/integrations

arn:aws:apigateway:*::/apis/*/stages/*/routesettings/*

arn:aws:apigateway:*::/apis/*

arn:aws:apigateway:*::/apis

| +| Permissions | Resources | +| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +|

lambda:CreateFunction

lambda:GetFunction

lambda:DeleteFunction

lambda:AddPermission

lambda:UpdateFunctionConfiguration

| arn:aws:lambda:\:\:function:feast-\* | +|

ecr:CreateRepository

ecr:DescribeRepositories

ecr:DeleteRepository

ecr:PutImage

ecr:DescribeImages

ecr:BatchDeleteImage

ecr:CompleteLayerUpload

ecr:UploadLayerPart

ecr:InitiateLayerUpload

ecr:BatchCheckLayerAvailability

ecr:GetDownloadUrlForLayer

ecr:GetRepositoryPolicy

ecr:SetRepositoryPolicy

ecr:GetAuthorizationToken

| \* | +|

iam:PassRole

| arn:aws:iam::\:role/ | +|

apigateway:*

|

arn:aws:apigateway:*::/apis/*/routes/*/routeresponses

arn:aws:apigateway:*::/apis/*/routes/*/routeresponses/*

arn:aws:apigateway:*::/apis/*/routes/*

arn:aws:apigateway:*::/apis/*/routes

arn:aws:apigateway:*::/apis/*/integrations

arn:aws:apigateway:*::/apis/*/stages/*/routesettings/*

arn:aws:apigateway:*::/apis/*

arn:aws:apigateway:*::/apis

| The following inline policy can be used to grant Feast the necessary permissions: @@ -202,4 +195,3 @@ $ curl -X POST \ ] } ``` - diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md index f83b765c3a..8209799086 100644 --- a/docs/reference/feature-servers/go-feature-server.md +++ b/docs/reference/feature-servers/go-feature-server.md @@ -71,14 +71,14 @@ online_store: go_feature_serving: True feature_server: feature_logging: - enable: True + enabled: True ``` Feature logging configuration in `feature_store.yaml` also allows to tweak some low-level parameters to achieve the best performance: ```yaml feature_server: feature_logging: - enable: True + enabled: True flush_interval_secs: 300 write_to_disk_interval_secs: 30 emit_timeout_micro_secs: 10000 diff --git a/docs/reference/feature-servers/python-feature-server.md b/docs/reference/feature-servers/python-feature-server.md index 2646c28ef4..d18bdc4f42 100644 --- a/docs/reference/feature-servers/python-feature-server.md +++ b/docs/reference/feature-servers/python-feature-server.md @@ -10,13 +10,14 @@ There is a CLI command that starts the server: `feast serve`. By default, Feast ## Deploying as a service -One can deploy a feature server by building a docker image that bundles in the project's `feature_store.yaml`. See this [helm chart](https://github.com/feast-dev/feast/blob/master/infra/charts/feast-python-server) for an example. +One can deploy a feature server by building a docker image that bundles in the project's `feature_store.yaml`. See this [helm chart](https://github.com/feast-dev/feast/blob/master/infra/charts/feast-feature-server) for an example on how to run Feast on Kubernetes. -A [remote feature server](../alpha-aws-lambda-feature-server.md) on AWS Lambda is also available. +A [remote feature server](alpha-aws-lambda-feature-server.md) on AWS Lambda is also available. 
## Example ### Initializing a feature server + Here's an example of how to start the Python feature server with a local feature repo: ```bash @@ -49,6 +50,7 @@ INFO: Uvicorn running on http://127.0.0.1:6566 (Press CTRL+C to quit) ``` ### Retrieving features + After the server starts, we can execute cURL commands from another terminal tab: ```bash @@ -140,7 +142,7 @@ $ curl -X POST \ It's also possible to specify a feature service name instead of the list of features: -```text +``` curl -X POST \ "http://localhost:6566/get-online-features" \ -d '{ @@ -152,10 +154,12 @@ curl -X POST \ ``` ### Pushing features to the online and offline stores + The Python feature server also exposes an endpoint for [push sources](../../data-sources/push.md). This endpoint allows you to push data to the online and/or offline store. -The request definition for pushmode is a string parameter `to` where the options are: ["online", "offline", "online_and_offline"]. Note that timestamps need to be strings. -```text +The request definition for pushmode is a string parameter `to` where the options are: \["online", "offline", "online\_and\_offline"]. Note that timestamps need to be strings. + +``` curl -X POST "http://localhost:6566/push" -d '{ "push_source_name": "driver_hourly_stats_push_source", "df": { @@ -171,6 +175,7 @@ curl -X POST "http://localhost:6566/push" -d '{ ``` or equivalently from Python: + ```python import json import requests diff --git a/docs/reference/offline-stores/README.md b/docs/reference/offline-stores/README.md index 57d7f35dea..f4e3af2f34 100644 --- a/docs/reference/offline-stores/README.md +++ b/docs/reference/offline-stores/README.md @@ -1,16 +1,39 @@ # Offline stores -Please see [Offline Store](../../getting-started/architecture-and-components/offline-store.md) for an explanation of offline stores. +Please see [Offline Store](../../getting-started/architecture-and-components/offline-store.md) for a conceptual explanation of offline stores. 
-{% page-ref page="file.md" %} +{% content-ref url="overview.md" %} +[overview.md](overview.md) +{% endcontent-ref %} -{% page-ref page="snowflake.md" %} +{% content-ref url="file.md" %} +[file.md](file.md) +{% endcontent-ref %} -{% page-ref page="bigquery.md" %} +{% content-ref url="snowflake.md" %} +[snowflake.md](snowflake.md) +{% endcontent-ref %} -{% page-ref page="redshift.md" %} +{% content-ref url="bigquery.md" %} +[bigquery.md](bigquery.md) +{% endcontent-ref %} -{% page-ref page="spark.md" %} +{% content-ref url="redshift.md" %} +[redshift.md](redshift.md) +{% endcontent-ref %} -{% page-ref page="postgres.md" %} +{% content-ref url="spark.md" %} +[spark.md](spark.md) +{% endcontent-ref %} +{% content-ref url="postgres.md" %} +[postgres.md](postgres.md) +{% endcontent-ref %} + +{% content-ref url="trino.md" %} +[trino.md](trino.md) +{% endcontent-ref %} + +{% content-ref url="mssql.md" %} +[mssql.md](mssql.md) +{% endcontent-ref %} diff --git a/docs/reference/offline-stores/bigquery.md b/docs/reference/offline-stores/bigquery.md index 255c587d6b..0e286d78c4 100644 --- a/docs/reference/offline-stores/bigquery.md +++ b/docs/reference/offline-stores/bigquery.md @@ -1,13 +1,11 @@ -# BigQuery +# BigQuery offline store ## Description The BigQuery offline store provides support for reading [BigQuerySources](../data-sources/bigquery.md). -* BigQuery tables and views are allowed as sources. * All joins happen within BigQuery. -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be uploaded to BigQuery in order to complete join operations. -* A [BigQueryRetrievalJob](https://github.com/feast-dev/feast/blob/c50a36ec1ad5b8d81c6f773c23204db7c7a7d218/sdk/python/feast/infra/offline_stores/bigquery.py#L210) is returned when calling `get_historical_features()`. +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. 
Pandas dataframes will be uploaded to BigQuery as a table (marked for expiration) in order to complete join operations. ## Example @@ -22,4 +20,38 @@ offline_store: ``` {% endcode %} -Configuration options are available [here](https://rtd.feast.dev/en/latest/#feast.repo_config.BigQueryOfflineStoreConfig). +The full set of configuration options is available in [BigQueryOfflineStoreConfig](https://rtd.feast.dev/en/latest/index.html#feast.infra.offline_stores.bigquery.BigQueryOfflineStoreConfig). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the BigQuery offline store. + +| | BigQuery | +| :----------------------------------------------------------------- | :------- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | yes | +| `write_logged_features` (persist logged features to offline store) | yes | + +Below is a matrix indicating which functionality is supported by `BigQueryRetrievalJob`. + +| | BigQuery | +| ----------------------------------------------------- | -------- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | yes | +| export to data lake (S3, GCS, etc.) 
| no | +| export to data warehouse | yes | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data* | partial | + +*See [GitHub issue](https://github.com/feast-dev/feast/issues/2530) for details on proposed solutions for enabling the BigQuery offline store to understand tables that use `_PARTITIONTIME` as the partition column. + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/offline-stores/file.md b/docs/reference/offline-stores/file.md index 42ac821691..4b76d9af90 100644 --- a/docs/reference/offline-stores/file.md +++ b/docs/reference/offline-stores/file.md @@ -1,11 +1,13 @@ -# File +# File offline store ## Description -The File offline store provides support for reading [FileSources](../data-sources/file.md). +The file offline store provides support for reading [FileSources](../data-sources/file.md). +It uses Dask as the compute engine. -* Only Parquet files are currently supported. -* All data is downloaded and joined using Python and may not scale to production workloads. +{% hint style="warning" %} +All data is downloaded and joined using Python and therefore may not scale to production workloads. +{% endhint %} ## Example @@ -19,4 +21,36 @@ offline_store: ``` {% endcode %} -Configuration options are available [here](https://rtd.feast.dev/en/latest/#feast.repo_config.FileOfflineStoreConfig). +The full set of configuration options is available in [FileOfflineStoreConfig](https://rtd.feast.dev/en/latest/#feast.infra.offline_stores.file.FileOfflineStoreConfig). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). 
+Below is a matrix indicating which functionality is supported by the file offline store. + +| | File | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | yes | +| `write_logged_features` (persist logged features to offline store) | yes | + +Below is a matrix indicating which functionality is supported by `FileRetrievalJob`. + +| | File | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | no | +| export to data lake (S3, GCS, etc.) | no | +| export to data warehouse | no | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/offline-stores/mssql.md b/docs/reference/offline-stores/mssql.md new file mode 100644 index 0000000000..bec0c8deb8 --- /dev/null +++ b/docs/reference/offline-stores/mssql.md @@ -0,0 +1,59 @@ +# MsSQL/Synapse offline store (contrib) + +## Description + +The MsSQL offline store provides support for reading [MsSQL Sources](../data-sources/mssql.md). Specifically, it is developed to read from [Synapse SQL](https://docs.microsoft.com/en-us/azure/synapse-analytics/sql/overview-features) on Microsoft Azure + +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. 
+ +## Disclaimer + +The MsSQL offline store does not achieve full test coverage. +Please do not assume complete stability. + +## Example + +{% code title="feature_store.yaml" %} +```yaml +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +offline_store: + type: mssql + connection_string: ${SQL_CONN} # Environment Variable +``` +{% endcode %} + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the MsSQL offline store. + +| | MsSql | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | no | +| `write_logged_features` (persist logged features to offline store) | no | + +Below is a matrix indicating which functionality is supported by `MsSqlServerRetrievalJob`. + +| | MsSql | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | no | +| export to data lake (S3, GCS, etc.) | no | +| export to data warehouse | no | +| local execution of Python-based on-demand transforms | no | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). 
diff --git a/docs/reference/offline-stores/overview.md b/docs/reference/offline-stores/overview.md new file mode 100644 index 0000000000..10f99813ba --- /dev/null +++ b/docs/reference/offline-stores/overview.md @@ -0,0 +1,58 @@ +# Overview + +## Functionality + +Here are the methods exposed by the `OfflineStore` interface, along with the core functionality supported by the method: +* `get_historical_features`: point-in-time correct join to retrieve historical features +* `pull_latest_from_table_or_query`: retrieve latest feature values for materialization into the online store +* `pull_all_from_table_or_query`: retrieve a saved dataset +* `offline_write_batch`: persist dataframes to the offline store, primarily for push sources +* `write_logged_features`: persist logged features to the offline store, for feature logging + +The first three of these methods all return a `RetrievalJob` specific to an offline store, such as a `SnowflakeRetrievalJob`. Here is a list of functionality supported by `RetrievalJob`s: +* export to dataframe +* export to arrow table +* export to arrow batches (to handle large datasets in memory) +* export to SQL +* export to data lake (S3, GCS, etc.) +* export to data warehouse +* export as Spark dataframe +* local execution of Python-based on-demand transforms +* remote execution of Python-based on-demand transforms +* persist results in the offline store +* preview the query plan before execution (`RetrievalJob`s are lazily executed) +* read partitioned data + +## Functionality Matrix + +There are currently four core offline store implementations: `FileOfflineStore`, `BigQueryOfflineStore`, `SnowflakeOfflineStore`, and `RedshiftOfflineStore`. +There are several additional implementations contributed by the Feast community (`PostgreSQLOfflineStore`, `SparkOfflineStore`, and `TrinoOfflineStore`), which are not guaranteed to be stable or to match the functionality of the core implementations. 
+Details for each specific offline store, such as how to configure it in a `feature_store.yaml`, can be found [here](README.md). + +Below is a matrix indicating which offline stores support which methods. + +| | File | BigQuery | Snowflake | Redshift | Postgres | Spark | Trino | +| :-------------------------------- | :-- | :-- | :-- | :-- | :-- | :-- | :-- | +| `get_historical_features` | yes | yes | yes | yes | yes | yes | yes | +| `pull_latest_from_table_or_query` | yes | yes | yes | yes | yes | yes | yes | +| `pull_all_from_table_or_query` | yes | yes | yes | yes | yes | yes | yes | +| `offline_write_batch` | yes | yes | yes | yes | no | no | no | +| `write_logged_features` | yes | yes | yes | yes | no | no | no | + + +Below is a matrix indicating which `RetrievalJob`s support what functionality. + +| | File | BigQuery | Snowflake | Redshift | Postgres | Spark | Trino | +| --------------------------------- | --- | --- | --- | --- | --- | --- | --- | +| export to dataframe | yes | yes | yes | yes | yes | yes | yes | +| export to arrow table | yes | yes | yes | yes | yes | yes | yes | +| export to arrow batches | no | no | no | yes | no | no | no | +| export to SQL | no | yes | no | yes | yes | no | yes | +| export to data lake (S3, GCS, etc.) 
| no | no | yes | no | yes | no | no | +| export to data warehouse | no | yes | yes | yes | yes | no | no | +| export as Spark dataframe | no | no | no | no | no | yes | no | +| local execution of Python-based on-demand transforms | yes | yes | yes | yes | yes | no | yes | +| remote execution of Python-based on-demand transforms | no | no | no | no | no | no | no | +| persist results in the offline store | yes | yes | yes | yes | yes | yes | no | +| preview the query plan before execution | yes | yes | yes | yes | yes | yes | yes | +| read partitioned data | yes | yes | yes | yes | yes | yes | yes | diff --git a/docs/reference/offline-stores/postgres.md b/docs/reference/offline-stores/postgres.md index 9bd472673a..506666fc37 100644 --- a/docs/reference/offline-stores/postgres.md +++ b/docs/reference/offline-stores/postgres.md @@ -1,20 +1,14 @@ -# PostgreSQL (contrib) +# PostgreSQL offline store (contrib) ## Description -The PostgreSQL offline store is an offline store that provides support for reading [PostgreSQL](../data-sources/postgres.md) data sources. +The PostgreSQL offline store provides support for reading [PostgreSQLSources](../data-sources/postgres.md). +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. A Pandas dataframes will be uploaded to Postgres as a table in order to complete join operations. +## Disclaimer -**DISCLAIMER**: This PostgreSQL offline store still does not achieve full test coverage. - -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view. -* A `PostgreSQLRetrievalJob` is returned when calling `get_historical_features()`. - * This allows you to call - * `to_df` to retrieve the pandas dataframe. - * `to_arrow` to retrieve the dataframe as a PyArrow table. - * `to_sql` to get the SQL query used to pull the features. 
- -* sslmode, sslkey_path, sslcert_path, and sslrootcert_path are optional +The PostgreSQL offline store does not achieve full test coverage. +Please do not assume complete stability. ## Example @@ -39,3 +33,38 @@ online_store: path: data/online_store.db ``` {% endcode %} + +Note that `sslmode`, `sslkey_path`, `sslcert_path`, and `sslrootcert_path` are optional parameters. +The full set of configuration options is available in [PostgreSQLOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStoreConfig). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the PostgreSQL offline store. + +| | Postgres | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | no | +| `write_logged_features` (persist logged features to offline store) | no | + +Below is a matrix indicating which functionality is supported by `PostgreSQLRetrievalJob`. + +| | Postgres | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | yes | +| export to data lake (S3, GCS, etc.) 
| yes | +| export to data warehouse | yes | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/offline-stores/redshift.md b/docs/reference/offline-stores/redshift.md index 73148730c5..2cdf49bdb9 100644 --- a/docs/reference/offline-stores/redshift.md +++ b/docs/reference/offline-stores/redshift.md @@ -1,13 +1,11 @@ -# Redshift +# Redshift offline store ## Description The Redshift offline store provides support for reading [RedshiftSources](../data-sources/redshift.md). -* Redshift tables and views are allowed as sources. * All joins happen within Redshift. -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be uploaded to Redshift in order to complete join operations. -* A [RedshiftRetrievalJob](https://github.com/feast-dev/feast/blob/bf557bcb72c7878a16dccb48443bbbe9dc3efa49/sdk/python/feast/infra/offline_stores/redshift.py#L161) is returned when calling `get_historical_features()`. +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. A Pandas dataframes will be uploaded to Redshift temporarily in order to complete join operations. ## Example @@ -27,7 +25,39 @@ offline_store: ``` {% endcode %} -Configuration options are available [here](https://github.com/feast-dev/feast/blob/bf557bcb72c7878a16dccb48443bbbe9dc3efa49/sdk/python/feast/infra/offline_stores/redshift.py#L22). +The full set of configuration options is available in [RedshiftOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.redshift.RedshiftOfflineStoreConfig). 
+ +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the Redshift offline store. + +| | Redshift | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | yes | +| `write_logged_features` (persist logged features to offline store) | yes | + +Below is a matrix indicating which functionality is supported by `RedshiftRetrievalJob`. + +| | Redshift | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | yes | +| export to SQL | yes | +| export to data lake (S3, GCS, etc.) | no | +| export to data warehouse | yes | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). ## Permissions diff --git a/docs/reference/offline-stores/snowflake.md b/docs/reference/offline-stores/snowflake.md index e2afaef90d..e40ad7cd7a 100644 --- a/docs/reference/offline-stores/snowflake.md +++ b/docs/reference/offline-stores/snowflake.md @@ -1,17 +1,10 @@ -# Snowflake +# Snowflake offline store ## Description The [Snowflake](https://trial.snowflake.com) offline store provides support for reading [SnowflakeSources](../data-sources/snowflake.md). - -* Snowflake tables and views are allowed as sources. 
* All joins happen within Snowflake. -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be uploaded to Snowflake in order to complete join operations. -* A `SnowflakeRetrievalJob` is returned when calling `get_historical_features()`. - * This allows you to call - * `to_snowflake` to save the dataset into Snowflake - * `to_sql` to get the SQL query that would execute on `to_df` - * `to_arrow_chunks` to get the result in batches ([Snowflake python connector docs](https://docs.snowflake.com/en/user-guide/python-connector-api.html#get_result_batches)) +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. A Pandas dataframes will be uploaded to Snowflake as a temporary table in order to complete join operations. ## Example @@ -31,4 +24,36 @@ offline_store: ``` {% endcode %} -Configuration options are available in [SnowflakeOfflineStoreConfig](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/offline_stores/snowflake.py#L56). +The full set of configuration options is available in [SnowflakeOfflineStoreConfig](https://rtd.feast.dev/en/latest/#feast.infra.offline_stores.snowflake.SnowflakeOfflineStoreConfig). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the Snowflake offline store. + +| | Snowflake | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | yes | +| `write_logged_features` (persist logged features to offline store) | yes | + +Below is a matrix indicating which functionality is supported by `SnowflakeRetrievalJob`. 
+ +| | Snowflake | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | yes | +| export to data lake (S3, GCS, etc.) | yes | +| export to data warehouse | yes | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/offline-stores/spark.md b/docs/reference/offline-stores/spark.md index 7eec8d7b73..f1ef1300bd 100644 --- a/docs/reference/offline-stores/spark.md +++ b/docs/reference/offline-stores/spark.md @@ -1,20 +1,15 @@ -# Spark (contrib) +# Spark offline store (contrib) ## Description -The Spark offline store is an offline store currently in alpha development that provides support for reading [SparkSources](../data-sources/spark.md). +The Spark offline store provides support for reading [SparkSources](../data-sources/spark.md). -## Disclaimer +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. A Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view. -This Spark offline store still does not achieve full test coverage and continues to fail some integration tests when integrating with the feast universal test suite. Please do NOT assume complete stability of the API. +## Disclaimer -* Spark tables and views are allowed as sources that are loaded in from some Spark store(e.g in Hive or in memory). -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. 
Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view. -* A `SparkRetrievalJob` is returned when calling `get_historical_features()`. - * This allows you to call - * `to_df` to retrieve the pandas dataframe. - * `to_arrow` to retrieve the dataframe as a pyarrow Table. - * `to_spark_df` to retrieve the dataframe the spark. +The Spark offline store does not achieve full test coverage. +Please do not assume complete stability. ## Example @@ -36,3 +31,37 @@ online_store: path: data/online_store.db ``` {% endcode %} + +The full set of configuration options is available in [SparkOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkOfflineStoreConfig). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the Spark offline store. + +| | Spark | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | no | +| `write_logged_features` (persist logged features to offline store) | no | + +Below is a matrix indicating which functionality is supported by `SparkRetrievalJob`. + +| | Spark | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | no | +| export to data lake (S3, GCS, etc.) 
| no | +| export to data warehouse | no | +| export as Spark dataframe | yes | +| local execution of Python-based on-demand transforms | no | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | yes | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/offline-stores/trino.md b/docs/reference/offline-stores/trino.md new file mode 100644 index 0000000000..8cc604248f --- /dev/null +++ b/docs/reference/offline-stores/trino.md @@ -0,0 +1,64 @@ +# Trino offline store (contrib) + +## Description + +The Trino offline store provides support for reading [TrinoSources](../data-sources/trino.md). +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. A Pandas dataframes will be uploaded to Trino as a table in order to complete join operations. + +## Disclaimer + +The Trino offline store does not achieve full test coverage. +Please do not assume complete stability. + +## Example + +{% code title="feature_store.yaml" %} +```yaml +project: feature_repo +registry: data/registry.db +provider: local +offline_store: + type: feast_trino.trino.TrinoOfflineStore + host: localhost + port: 8080 + catalog: memory + connector: + type: memory +online_store: + path: data/online_store.db +``` +{% endcode %} + +The full set of configuration options is available in [TrinoOfflineStoreConfig](https://rtd.feast.dev/en/master/#trino-offline-store). + +## Functionality Matrix + +The set of functionality supported by offline stores is described in detail [here](overview.md#functionality). +Below is a matrix indicating which functionality is supported by the Trino offline store. 
+ +| | Trino | +| :-------------------------------- | :-- | +| `get_historical_features` (point-in-time correct join) | yes | +| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes | +| `pull_all_from_table_or_query` (retrieve a saved dataset) | yes | +| `offline_write_batch` (persist dataframes to offline store) | no | +| `write_logged_features` (persist logged features to offline store) | no | + +Below is a matrix indicating which functionality is supported by `TrinoRetrievalJob`. + +| | Trino | +| --------------------------------- | --- | +| export to dataframe | yes | +| export to arrow table | yes | +| export to arrow batches | no | +| export to SQL | yes | +| export to data lake (S3, GCS, etc.) | no | +| export to data warehouse | no | +| export as Spark dataframe | no | +| local execution of Python-based on-demand transforms | yes | +| remote execution of Python-based on-demand transforms | no | +| persist results in the offline store | no | +| preview the query plan before execution | yes | +| read partitioned data | yes | + +To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix). diff --git a/docs/reference/online-stores/README.md b/docs/reference/online-stores/README.md index 5eb566af3c..6d616b46f2 100644 --- a/docs/reference/online-stores/README.md +++ b/docs/reference/online-stores/README.md @@ -2,14 +2,31 @@ Please see [Online Store](../../getting-started/architecture-and-components/online-store.md) for an explanation of online stores. 
-{% page-ref page="sqlite.md" %} +{% content-ref url="sqlite.md" %} +[sqlite.md](sqlite.md) +{% endcontent-ref %} -{% page-ref page="snowflake.md" %} +{% content-ref url="snowflake.md" %} +[snowflake.md](snowflake.md) +{% endcontent-ref %} -{% page-ref page="redis.md" %} +{% content-ref url="redis.md" %} +[redis.md](redis.md) +{% endcontent-ref %} -{% page-ref page="datastore.md" %} +{% content-ref url="datastore.md" %} +[datastore.md](datastore.md) +{% endcontent-ref %} -{% page-ref page="dynamodb.md" %} +{% content-ref url="dynamodb.md" %} +[dynamodb.md](dynamodb.md) +{% endcontent-ref %} + +{% content-ref url="postgres.md" %} +[postgres.md](postgres.md) +{% endcontent-ref %} + +{% content-ref url="cassandra.md" %} +[cassandra.md](cassandra.md) +{% endcontent-ref %} -{% page-ref page="postgres.md" %} diff --git a/docs/reference/online-stores/cassandra.md b/docs/reference/online-stores/cassandra.md new file mode 100644 index 0000000000..3355c3728c --- /dev/null +++ b/docs/reference/online-stores/cassandra.md @@ -0,0 +1,61 @@ +# Cassandra + Astra DB online store (contrib) + +## Description + +The [Cassandra / Astra DB] online store provides support for materializing feature values into an Apache Cassandra / Astra DB database for online features. + +* The whole project is contained within a Cassandra keyspace +* Each feature view is mapped one-to-one to a specific Cassandra table +* This implementation inherits all strengths of Cassandra such as high availability, fault-tolerance, and data distribution + +An easy way to get started is the command `feast init REPO_NAME -t cassandra`. 
+ +### Example (Cassandra) + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: cassandra + hosts: + - 192.168.1.1 + - 192.168.1.2 + - 192.168.1.3 + keyspace: KeyspaceName + port: 9042 # optional + username: user # optional + password: secret # optional + protocol_version: 5 # optional + load_balancing: # optional + local_dc: 'datacenter1' # optional + load_balancing_policy: 'TokenAwarePolicy(DCAwareRoundRobinPolicy)' # optional +``` +{% endcode %} + +### Example (Astra DB) + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: cassandra + secure_bundle_path: /path/to/secure/bundle.zip + keyspace: KeyspaceName + username: Client_ID + password: Client_Secret + protocol_version: 4 # optional + load_balancing: # optional + local_dc: 'eu-central-1' # optional + load_balancing_policy: 'TokenAwarePolicy(DCAwareRoundRobinPolicy)' # optional + +``` +{% endcode %} + +For a full explanation of configuration options please look at file +`sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/README.md`. + +Storage specifications can be found at `docs/specs/online_store_format.md`. 
\ No newline at end of file diff --git a/docs/reference/online-stores/datastore.md b/docs/reference/online-stores/datastore.md index 012d497f30..ed1425abb6 100644 --- a/docs/reference/online-stores/datastore.md +++ b/docs/reference/online-stores/datastore.md @@ -1,4 +1,4 @@ -# Datastore +# Datastore online store ## Description diff --git a/docs/reference/online-stores/dynamodb.md b/docs/reference/online-stores/dynamodb.md index 2af7e422d6..f9f8b4339d 100644 --- a/docs/reference/online-stores/dynamodb.md +++ b/docs/reference/online-stores/dynamodb.md @@ -1,4 +1,4 @@ -# DynamoDB +# DynamoDB online store ## Description diff --git a/docs/reference/online-stores/postgres.md b/docs/reference/online-stores/postgres.md index 7d24079da9..4f51dff617 100644 --- a/docs/reference/online-stores/postgres.md +++ b/docs/reference/online-stores/postgres.md @@ -1,4 +1,4 @@ -# PostgreSQL (contrib) +# PostgreSQL online store (contrib) ## Description diff --git a/docs/reference/online-stores/redis.md b/docs/reference/online-stores/redis.md index ce1de2ad54..4388ccfa0a 100644 --- a/docs/reference/online-stores/redis.md +++ b/docs/reference/online-stores/redis.md @@ -1,4 +1,4 @@ -# Redis +# Redis online store ## Description diff --git a/docs/reference/online-stores/snowflake.md b/docs/reference/online-stores/snowflake.md index ccf3d526da..bf975fa7ea 100644 --- a/docs/reference/online-stores/snowflake.md +++ b/docs/reference/online-stores/snowflake.md @@ -1,4 +1,4 @@ -# Snowflake +# Snowflake online store ## Description diff --git a/docs/reference/online-stores/sqlite.md b/docs/reference/online-stores/sqlite.md index fd11e3439c..668e6024e3 100644 --- a/docs/reference/online-stores/sqlite.md +++ b/docs/reference/online-stores/sqlite.md @@ -1,4 +1,4 @@ -# SQLite +# SQLite online store ## Description diff --git a/docs/reference/providers/README.md b/docs/reference/providers/README.md index dc52d92726..20686a1e14 100644 --- a/docs/reference/providers/README.md +++ 
b/docs/reference/providers/README.md @@ -7,3 +7,5 @@ Please see [Provider](../../getting-started/architecture-and-components/provider {% page-ref page="google-cloud-platform.md" %} {% page-ref page="amazon-web-services.md" %} + +{% page-ref page="azure.md" %} diff --git a/docs/reference/providers/azure.md b/docs/reference/providers/azure.md new file mode 100644 index 0000000000..123bf08763 --- /dev/null +++ b/docs/reference/providers/azure.md @@ -0,0 +1,26 @@ +# Azure (contrib) + +## Description + +* Offline Store: Uses the **MsSql** offline store by default. Also supports File as the offline store. +* Online Store: Uses the **Redis** online store by default. Also supports Sqlite as an online store. + +## Disclaimer + +The Azure provider does not achieve full test coverage. +Please do not assume complete stability. + +## Example + +{% code title="feature_store.yaml" %} +```yaml +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +``` +{% endcode %} \ No newline at end of file diff --git a/docs/reference/type-system.md b/docs/reference/type-system.md new file mode 100644 index 0000000000..affe394f57 --- /dev/null +++ b/docs/reference/type-system.md @@ -0,0 +1,41 @@ +# Type System + +## Motivation + +Feast uses an internal type system to provide guarantees on training and serving data. +Feast currently supports eight primitive types - `INT32`, `INT64`, `FLOAT32`, `FLOAT64`, `STRING`, `BYTES`, `BOOL`, and `UNIX_TIMESTAMP` - and the corresponding array types. +Null types are not supported, although the `UNIX_TIMESTAMP` type is nullable. +The type system is controlled by [`Value.proto`](https://github.com/feast-dev/feast/blob/master/protos/feast/types/Value.proto) in protobuf and by [`types.py`](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/types.py) in Python. 
+Type conversion logic can be found in [`type_map.py`](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/type_map.py).
+
+## Examples
+
+### Feature inference
+
+During `feast apply`, Feast runs schema inference on the data sources underlying feature views.
+For example, if the `schema` parameter is not specified for a feature view, Feast will examine the schema of the underlying data source to determine the event timestamp column, feature columns, and entity columns.
+Each of these columns must be associated with a Feast type, which requires conversion from the data source type system to the Feast type system.
+* The feature inference logic calls `_infer_features_and_entities`.
+* `_infer_features_and_entities` calls `source_datatype_to_feast_value_type`.
+* `source_datatype_to_feast_value_type` calls the appropriate method in `type_map.py`. For example, if a `SnowflakeSource` is being examined, `snowflake_python_type_to_feast_value_type` from `type_map.py` will be called.
+
+### Materialization
+
+Feast serves feature values as [`Value`](https://github.com/feast-dev/feast/blob/master/protos/feast/types/Value.proto) proto objects, which have a type corresponding to Feast types.
+Thus Feast must materialize feature values into the online store as `Value` proto objects.
+* The local materialization engine first pulls the latest historical features and converts them to pyarrow.
+* Then it calls `_convert_arrow_to_proto` to convert the pyarrow table to proto format.
+* This calls `python_values_to_proto_values` in `type_map.py` to perform the type conversion.
+
+### Historical feature retrieval
+
+The Feast type system is typically not necessary when retrieving historical features.
+A call to `get_historical_features` will return a `RetrievalJob` object, which allows the user to export the results to one of several possible locations: a Pandas dataframe, a pyarrow table, a data lake (e.g. S3 or GCS), or the offline store (e.g. a Snowflake table).
+In all of these cases, the type conversion is handled natively by the offline store. +For example, a BigQuery query exposes a `to_dataframe` method that will automatically convert the result to a dataframe, without requiring any conversions within Feast. + +### Feature serving + +As mentioned above in the section on [materialization](#materialization), Feast persists feature values into the online store as `Value` proto objects. +A call to `get_online_features` will return an `OnlineResponse` object, which essentially wraps a bunch of `Value` protos with some metadata. +The `OnlineResponse` object can then be converted into a Python dictionary, which calls `feast_value_type_to_python_type` from `type_map.py`, a utility that converts the Feast internal types to Python native types. diff --git a/docs/roadmap.md b/docs/roadmap.md index e481453dff..dc1d9ae1ab 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -10,7 +10,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) - * [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL source (contrib plugin)](https://docs.feast.dev/reference/data-sources/mssql) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) @@ -19,7 +19,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] 
[BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) - * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) + * [x] [Azure Synapse + Azure SQL (contrib plugin)](https://docs.feast.dev/reference/offline-stores/mssql.md) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/offline-stores/postgres) * [x] [Trino (contrib plugin)](https://github.com/Shopify/feast-trino) @@ -35,7 +35,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Azure Cache for Redis (community plugin)](https://github.com/Azure/feast-azure) * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/online-stores/postgres) * [x] [Custom online store support](https://docs.feast.dev/how-to-guides/adding-support-for-a-new-online-store) - * [x] [Cassandra / AstraDB](https://github.com/datastaxdevs/feast-cassandra-online-store) + * [x] [Cassandra / AstraDB](https://docs.feast.dev/reference/online-stores/cassandra) * [ ] Bigtable (in progress) * **Feature Engineering** * [x] On-demand Transformations (Alpha release. See [RFC](https://docs.google.com/document/d/1lgfIw0Drc65LpaxbUu49RCeJgMew547meSJttnUqz7c/edit#)) diff --git a/docs/specs/online_store_format.md b/docs/specs/online_store_format.md index 9f901ae69c..5c3c545c8d 100644 --- a/docs/specs/online_store_format.md +++ b/docs/specs/online_store_format.md @@ -92,6 +92,86 @@ Other types of entity keys are not supported in this version of the specificatio ![Datastore Online Example](datastore_online_example.png) +## Cassandra/Astra DB Online Store Format + +### Overview + +Apache Cassandra™ is a table-oriented NoSQL distributed database. Astra DB is a managed database-as-a-service +built on Cassandra, and will be assimilated to the former in what follows. + +In Cassandra, tables are grouped in _keyspaces_ (groups of related tables). 
Each table is comprised of +_rows_, each containing data for a given set of _columns_. Moreover, rows are grouped in _partitions_ according +to a _partition key_ (a portion of the uniqueness-defining _primary key_ set of columns), so that all rows +with the same values for the partition key are guaranteed to be stored on the same Cassandra nodes, next to each other, +which guarantees fast retrieval times. + +This architecture makes Cassandra a good fit for an online feature store in Feast. + +### Cassandra Online Store Format + +Each project (denoted by its name, called "feature store name" elsewhere) may contain an +arbitrary number of `FeatureView`s: these correspond each to a specific table, and +all tables for a project are to be contained in a single keyspace. The keyspace should +have been created by the Feast user preliminarly and is to be specified in the feature store +configuration `yaml`. + +The table for a project `project` and feature view `FeatureView` will have name +`project_FeatureView` (e.g. `feature_repo_driver_hourly_stats`). + +All tables have the same structure. Cassandra is schemaful and the columns are strongly typed. +In the following table schema (which also serves as Chebotko diagram) the Python +and Cassandra data types are both specified: + +|Table: |``_`` | | _(Python type)_ | +|---------------|-----------------------------|--|----------------------| +|`entity_key` |`TEXT` |K | `str` | +|`feature_name` |`TEXT` |C↑| `str` | +|`value` |`BLOB` | | `bytes` | +|`event_ts` |`TIMESTAMP` | | `datetime.datetime` | +|`created_ts` |`TIMESTAMP` | | `datetime.datetime` | + +Each row in the table represents a single value for a feature in a feature view, +thus associated to a specific entity. The choice of partitioning ensures that, +within a given feature view (i.e. 
a single table), for a given entity any number +of features can be retrieved with a single, best-practice-respecting query +(which is what happens in the `online_read` method implementation). + + +The `entity_key` column is computed as `serialize_entity_key(entityKey).hex()`, +where `entityKey` is of type `feast.protos.feast.types.EntityKey_pb2.EntityKey`. + +The value of `feature_name` is the plain-text name of the feature as defined +in the corresponding `FeatureView`. + +For `value`, the bytes from `[protoValue].SerializeToString()` +are used, where `protoValue` is of type `feast.protos.feast.types.Value_pb2.Value`. + +Column `event_ts` stores the timestamp the feature value refers to, as passed +to the store method. Conversely, column `created_ts`, meant to store the write +time for the entry, is now being deprecated and will be never written by this +online-store implementation. Thanks to the internal storage mechanism of Cassandra, +this does not incur a noticeable performance penalty (hence, for the time being, +the column can be maintained in the schema). + +### Example entry + +For a project `feature_repo` and feature view named `driver_hourly_stats`, +a typical row in table `feature_repo_driver_hourly_stats` might look like: + +|Column |content | notes | +|---------------|-----------------------------------------------------|-------------------------------------------------------------------| +|`entity_key` |`020000006472697665725f69640400000004000000ea030000` | from `"driver_id = 1002"` | +|`feature_name` |`conv_rate` | | +|`value` |`0x35f5696d3f` | from `float_val: 0.9273980259895325`, i.e. 
`(b'5\xf5im?').hex()` | +|`event_ts` |`2022-07-07 09:00:00.000000+0000` | from `datetime.datetime(2022, 7, 7, 9, 0)` | +|`created_ts` |`null` | not explicitly written to avoid unnecessary tombstones | + +### Known Issues + +If a `FeatureView` ever gets _re-defined_ in a schema-breaking way, the implementation is not able to rearrange the +schema of the underlying table accordingly (neither dropping all data nor, even less so, keeping it somehow). +This should never occur, lest one encounters all sorts of data-retrieval issues anywhere in Feast usage. + # Appendix ##### Appendix A. Value proto format. diff --git a/docs/tutorials/azure/README.md b/docs/tutorials/azure/README.md new file mode 100644 index 0000000000..2bfd53adf7 --- /dev/null +++ b/docs/tutorials/azure/README.md @@ -0,0 +1,88 @@ +# Getting started with Feast on Azure + +The objective of this tutorial is to build a model that predicts if a driver will complete a trip based on a number of features ingested into Feast. During this tutorial you will: + +1. Deploy the infrastructure for a feature store (using an ARM template) +2. Register features into a central feature registry hosted on Blob Storage +3. Consume features from the feature store for training and inference + +## Prerequisites + +For this tutorial you will require: + +1. An Azure subscription. +2. Working knowledge of Python and ML concepts. +3. Basic understanding of Azure Machine Learning - using notebooks, etc. + +## 1. Deploy Infrastructure + +We have created an ARM template that deploys and configures all the infrastructure required to run feast in Azure. This makes the set-up very simple - select the **Deploy to Azure** button below. + +The only 2 required parameters during the set-up are: + +- **Admin Password** for the the Dedicated SQL Pool being deployed. +- **Principal ID** this is to set the storage permissions for the feast registry store. 
You can find the value for this by opening **Cloud Shell** and running the following command:
+ +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ + +![compute instance kernel](media/ci-kernel.png) + + +## 4. Register features in Feature store + +In the Azure ML Studio, select *Notebooks* from the left-hand menu and then open the [register features into your feature registry notebook](notebooks/part2-register-features.ipynb). Work through this notebook. + +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ + +## 5.Train and Deploy a model using the Feature Store + +In the Azure ML Studio, select *Notebooks* from the left-hand menu and then open the [train and deploy a model using feast notebook](notebooks/part3-train-and-deploy-with-feast.ipynb). Work through this notebook. + +> __💁Ensure the Jupyter kernel is set to Python 3.8 - AzureML__ +> +> If problems are encountered during model training stage, create a new cell and rexecute `!pip install scikit-learn==0.22.1`. Upon completion, restart the Kernel and start over. + +## 6. Running Feast Azure Tutorials locally without Azure workspace + +* If you are on a free tier instance, you will not be able to deploy the azure deployment because the azure workspace requires VCPUs and the free trial subscription does not have a quota. +* The workaround is to remove the `Microsoft.MachineLearningServices/workspaces/computes` resource from `fs_synapse_azure_deploy.json` and setting up the environment locally. + 1. After deployment, find your `Azure SQL Pool` secrets by going to `Subscriptions->->Resource Group->Key Vault` and giving your account admin permissions to the keyvault. Retrieve the `FEAST-REGISTRY-PATH`, `FEAST-OFFLINE-STORE-CONN`, and `FEAST-ONLINE-STORE-CONN` secrets to use in your local environment. + 2. In your local environment, you will need to install the azure cli and login to the cli using `az login`. + 3. After everything is setup, you should be able to work through the first 2 tutorial notebooks without any errors (The 3rd notebook requires Azure workspace resources). 
\ No newline at end of file diff --git a/docs/tutorials/azure/data/data_generator.py b/docs/tutorials/azure/data/data_generator.py new file mode 100644 index 0000000000..77fec08296 --- /dev/null +++ b/docs/tutorials/azure/data/data_generator.py @@ -0,0 +1,260 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import numpy as np +import pandas as pd +from datetime import datetime, timedelta +from pytz import FixedOffset, timezone, utc +from random import randint +from enum import Enum +from sqlalchemy import create_engine, DateTime +from datetime import datetime + +DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL = "event_timestamp" + + +class EventTimestampType(Enum): + TZ_NAIVE = 0 + TZ_AWARE_UTC = 1 + TZ_AWARE_FIXED_OFFSET = 2 + TZ_AWARE_US_PACIFIC = 3 + + +def _convert_event_timestamp(event_timestamp: pd.Timestamp, t: EventTimestampType): + if t == EventTimestampType.TZ_NAIVE: + return event_timestamp + elif t == EventTimestampType.TZ_AWARE_UTC: + return event_timestamp.replace(tzinfo=utc) + elif t == EventTimestampType.TZ_AWARE_FIXED_OFFSET: + return event_timestamp.replace(tzinfo=utc).astimezone(FixedOffset(60)) + elif t == EventTimestampType.TZ_AWARE_US_PACIFIC: + return event_timestamp.replace(tzinfo=utc).astimezone(timezone("US/Pacific")) + + +def create_orders_df( + customers, + drivers, + start_date, + end_date, + order_count, + infer_event_timestamp_col=False, +) -> pd.DataFrame: + """ + Example df generated by this function: + | order_id | driver_id | customer_id | order_is_success | event_timestamp | + +----------+-----------+-------------+------------------+---------------------+ + | 100 | 5004 | 1007 | 0 | 2021-03-10 19:31:15 | + | 101 | 5003 | 1006 | 0 | 2021-03-11 22:02:50 | + | 102 | 5010 | 1005 | 0 | 2021-03-13 00:34:24 | + | 103 | 5010 | 1001 | 1 | 2021-03-14 03:05:59 | + """ + df = pd.DataFrame() + df["order_id"] = [order_id for order_id in range(100, 100 + order_count)] + df["driver_id"] = np.random.choice(drivers, 
order_count) + df["customer_id"] = np.random.choice(customers, order_count) + df["order_is_success"] = np.random.randint(0, 2, size=order_count).astype(np.int32) + + if infer_event_timestamp_col: + df["e_ts"] = [ + _convert_event_timestamp( + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"), + EventTimestampType(3), + ) + for idx, dt in enumerate( + pd.date_range(start=start_date, end=end_date, periods=order_count) + ) + ] + df.sort_values( + by=["e_ts", "order_id", "driver_id", "customer_id"], inplace=True, + ) + else: + df[DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL] = [ + _convert_event_timestamp( + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"), + EventTimestampType(idx % 4), + ) + for idx, dt in enumerate( + pd.date_range(start=start_date, end=end_date, periods=order_count) + ) + ] + df.sort_values( + by=[ + DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, + "order_id", + "driver_id", + "customer_id", + ], + inplace=True, + ) + return df + + +def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame: + """ + Example df generated by this function: + | datetime | driver_id | conv_rate | acc_rate | avg_daily_trips | created | + |------------------+-----------+-----------+----------+-----------------+------------------| + | 2021-03-17 19:31 | 5010 | 0.229297 | 0.685843 | 861 | 2021-03-24 19:34 | + | 2021-03-17 20:31 | 5010 | 0.781655 | 0.861280 | 769 | 2021-03-24 19:34 | + | 2021-03-17 21:31 | 5010 | 0.150333 | 0.525581 | 778 | 2021-03-24 19:34 | + | 2021-03-17 22:31 | 5010 | 0.951701 | 0.228883 | 570 | 2021-03-24 19:34 | + | 2021-03-17 23:31 | 5010 | 0.819598 | 0.262503 | 473 | 2021-03-24 19:34 | + | | ... | ... | ... | ... 
| | + | 2021-03-24 16:31 | 5001 | 0.061585 | 0.658140 | 477 | 2021-03-24 19:34 | + | 2021-03-24 17:31 | 5001 | 0.088949 | 0.303897 | 618 | 2021-03-24 19:34 | + | 2021-03-24 18:31 | 5001 | 0.096652 | 0.747421 | 480 | 2021-03-24 19:34 | + | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | + | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | + """ + df_hourly = pd.DataFrame( + { + "datetime": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=start_date, end=end_date, freq="1H", closed="left" + ) + ] + # include a fixed timestamp for get_historical_features in the quickstart + # + [ + # pd.Timestamp( + # year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC" + # ) + # ] + } + ) + df_all_drivers = pd.DataFrame() + dates = df_hourly["datetime"].map(pd.Timestamp.date).unique() + + for driver in drivers: + df_hourly_copy = df_hourly.copy() + df_hourly_copy["driver_id"] = driver + for date in dates: + df_hourly_copy.loc[ + df_hourly_copy["datetime"].map(pd.Timestamp.date) == date, + "avg_daily_trips", + ] = randint(10, 30) + df_all_drivers = pd.concat([df_hourly_copy, df_all_drivers]) + + df_all_drivers.reset_index(drop=True, inplace=True) + rows = df_all_drivers["datetime"].count() + + df_all_drivers["conv_rate"] = np.random.random(size=rows).astype(np.float32) + df_all_drivers["acc_rate"] = np.random.random(size=rows).astype(np.float32) + + df_all_drivers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) + + # Create duplicate rows that should be filtered by created timestamp + # TODO: These duplicate rows area indirectly being filtered out by the point in time join already. 
We need to + # inject a bad row at a timestamp where we know it will get joined to the entity dataframe, and then test that + # we are actually filtering it with the created timestamp + late_row = df_all_drivers.iloc[int(rows / 2)] + df_all_drivers = df_all_drivers.append(late_row).append(late_row) + + return df_all_drivers + + +def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.DataFrame: + """ + Example df generated by this function: + | datetime | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created | + |------------------+-------------+-----------------+---------------------+---------------------+------------------| + | 2021-03-17 19:31 | 1010 | 0.889188 | 0.049057 | 412 | 2021-03-24 19:38 | + | 2021-03-18 19:31 | 1010 | 0.979273 | 0.212630 | 639 | 2021-03-24 19:38 | + | 2021-03-19 19:31 | 1010 | 0.976549 | 0.176881 | 70 | 2021-03-24 19:38 | + | 2021-03-20 19:31 | 1010 | 0.273697 | 0.325012 | 68 | 2021-03-24 19:38 | + | 2021-03-21 19:31 | 1010 | 0.438262 | 0.313009 | 192 | 2021-03-24 19:38 | + | | ... | ... | ... | ... 
| | + | 2021-03-19 19:31 | 1001 | 0.738860 | 0.857422 | 344 | 2021-03-24 19:38 | + | 2021-03-20 19:31 | 1001 | 0.848397 | 0.745989 | 106 | 2021-03-24 19:38 | + | 2021-03-21 19:31 | 1001 | 0.301552 | 0.185873 | 812 | 2021-03-24 19:38 | + | 2021-03-22 19:31 | 1001 | 0.943030 | 0.561219 | 322 | 2021-03-24 19:38 | + | 2021-03-23 19:31 | 1001 | 0.354919 | 0.810093 | 273 | 2021-03-24 19:38 | + """ + df_daily = pd.DataFrame( + { + "datetime": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=start_date, end=end_date, freq="1D", closed="left" + ) + ] + } + ) + df_all_customers = pd.DataFrame() + + for customer in customers: + df_daily_copy = df_daily.copy() + rows = df_daily_copy["datetime"].count() + df_daily_copy["customer_id"] = customer + df_daily_copy["current_balance"] = np.random.uniform( + low=10.0, high=50.0, size=rows + ).astype(np.float32) + df_daily_copy["lifetime_trip_count"] = np.linspace( + start=randint(10, 20), stop=randint(40, 50), num=rows + ).astype(np.int32) + df_daily_copy["avg_passenger_count"] = np.random.uniform( + low=1, high=3, size=rows + ).astype(np.float32) + df_all_customers = pd.concat([df_daily_copy, df_all_customers]) + + df_all_customers.reset_index(drop=True, inplace=True) + + rows = df_all_customers["datetime"].count() + + # TODO: Remove created timestamp in order to test whether its really optional + df_all_customers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) + return df_all_customers + + +def generate_entities(date, n_customers, n_drivers, order_count): + end_date = date + before_start_date = end_date - timedelta(days=365) + start_date = end_date - timedelta(days=7) + after_end_date = end_date + timedelta(days=365) + customer_entities = [20000 + c_id for c_id in range(n_customers)] + driver_entities = [50000 + d_id for d_id in range(n_drivers)] + orders_df = create_orders_df( + customers=customer_entities, + drivers=driver_entities, + start_date=start_date, + 
end_date=end_date, + order_count=order_count, + infer_event_timestamp_col=False, + ) + return customer_entities, driver_entities, end_date, orders_df, start_date + + +def save_df_to_csv(df, table_name, dtype): + df.to_csv(table_name+".csv", index=False) + + +if __name__ == "__main__": + start_date = datetime.now().replace(microsecond=0, second=0, minute=0) + ( + customer_entities, + driver_entities, + end_date, + orders_df, + start_date, + ) = generate_entities(start_date, 1000, 1000, 20000) + + customer_df = create_customer_daily_profile_df( + customer_entities, start_date, end_date + ) + print(customer_df.head()) + + drivers_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date) + + print(drivers_df.head()) + + + orders_table = "orders" + driver_hourly_table = "driver_hourly" + customer_profile_table = "customer_profile" + + print("uploading orders") + save_df_to_csv(orders_df, orders_table, dtype={"event_timestamp": DateTime()}) + print("uploading drivers") + save_df_to_csv(drivers_df, driver_hourly_table, dtype={"datetime": DateTime()}) + print("uploading customers") + save_df_to_csv(customer_df, customer_profile_table, dtype={"datetime": DateTime()}) \ No newline at end of file diff --git a/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json b/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json new file mode 100644 index 0000000000..2846a5341d --- /dev/null +++ b/docs/tutorials/azure/deployment/fs_sqldb_azuredeploy.json @@ -0,0 +1,340 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "administratorLoginPassword": { + "type": "securestring", + "metadata": { + "description": "The administrator password of the SQL logical server." + } + }, + "principalId": { + "type": "string", + "metadata": { + "description": "Specifies the principal ID assigned to the role. 
You can find in cloud shell using 'az ad signed-in-user show --query id -o tsv'" + } + }, + "administratorLogin": { + "type": "string", + "metadata": { + "description": "The administrator username of the SQL logical server." + }, + "defaultValue": "azureuser" + }, + "location": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "[resourceGroup().location]" + }, + "registryBlobStore": { + "type": "string", + "metadata": { + "description": "Storage account to host the feast registry db" + }, + "defaultValue": "[concat('fsregistry',uniqueString(resourceGroup().id))]" + }, + "sqlServerName": { + "type": "string", + "metadata": { + "description": "The SQL Server Name" + }, + "defaultValue": "[concat('fssqlsvr',uniqueString(resourceGroup().id))]" + }, + "sqlDbName": { + "type": "string", + "metadata": { + "description": "SQL DB Name" + }, + "defaultValue": "[concat('fsoffline',uniqueString(resourceGroup().id))]" + }, + "redisCacheName": { + "type": "string", + "metadata": { + "description": "Redis Cache Name" + }, + "defaultValue": "[concat('fsonline',uniqueString(resourceGroup().id))]" + }, + "amlWorkspaceName": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "[concat('mlws',uniqueString(resourceGroup().id))]" + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "Standard_DS3_v2" + }, + "roleDefinitionID": { + "type": "string", + "metadata": { + "description": "Specifies the role definition ID used in the role assignment." 
+ }, + "defaultValue": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" + } + }, + "functions": [], + "variables": { + "tenantId": "[subscription().tenantId]", + "storageAccountName": "[concat('st', uniqueString(resourceGroup().id))]", + "keyVaultName": "[concat('kv-', uniqueString(resourceGroup().id))]", + "applicationInsightsName": "[concat('appi-', uniqueString(resourceGroup().id))]", + "containerRegistryName": "[concat('cr', uniqueString(resourceGroup().id))]", + "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "registryAccount": "[resourceId('Microsoft.Storage/storageAccounts', parameters('registryBlobStore'))]", + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", + "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", + "redisCache": "[resourceId('Microsoft.Cache/redis', parameters('redisCacheName'))]", + "roleAssignmentName": "[guid(parameters('principalId'), parameters('roleDefinitionID'), resourceGroup().id)]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-01-01", + "name": "[variables('storageAccountName')]", + "location": "[parameters('location')]", + "sku": { + "name": "Standard_RAGRS" + }, + "kind": "StorageV2", + "properties": { + "encryption": { + "services": { + "blob": { + "enabled": true + }, + "file": { + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-04-01-preview", + "name": "[variables('keyVaultName')]", + "location": "[parameters('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" + }, + "accessPolicies": [], + "enableSoftDelete": true + }, + 
"resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-OFFLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('mssql+pyodbc://',parameters('administratorLogin'),':',parameters('administratorLoginPassword'),'@', parameters('sqlServerName'),'.database.windows.net:1433/', parameters('sqlDbName'), '?driver=ODBC+Driver+17+for+SQL+Server&autocommit=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + } + ] + }, + { + "type": "Microsoft.Insights/components", + "apiVersion": "2020-02-02", + "name": "[variables('applicationInsightsName')]", + "location": "[if(or(equals(parameters('location'),'eastus2'), equals(parameters('location'),'westcentralus')),'southcentralus',parameters('location'))]", + "kind": "web", + "properties": { + "Application_Type": "web" + } + }, + { + "type": "Microsoft.ContainerRegistry/registries", + "sku": { + "name": "Standard", + "tier": "Standard" + }, + "name": "[variables('containerRegistryName')]", + "apiVersion": "2019-12-01-preview", + "location": "[parameters('location')]", + "properties": { + "adminUserEnabled": true + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces", + "apiVersion": "2021-04-01", + "name": "[parameters('amlWorkspaceName')]", + "location": "[resourceGroup().location]", + "identity": { + "type": "SystemAssigned" + }, + "tags": { + "displayName": "Azure ML Workspace" + }, + "dependsOn": [ + "[variables('storageAccount')]", + "[variables('keyVault')]", + "[variables('applicationInsights')]", + "[variables('containerRegistry')]" + ], + "properties": { + "storageAccount": "[variables('storageAccount')]", + "keyVault": "[variables('keyVault')]", + "applicationInsights": "[variables('applicationInsights')]", + "containerRegistry": "[variables('containerRegistry')]" + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + 
"name": "[concat(parameters('amlWorkspaceName'), '/', concat('ci-',uniqueString(resourceGroup().id)))]", + "apiVersion": "2021-07-01", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', concat(parameters('amlWorkspaceName')))]" + ], + "location": "[parameters('location')]", + "properties": { + "computeType": "ComputeInstance", + "properties": { + "vmSize": "[parameters('vmSize')]", + "setupScripts": { + "scripts": { + "creationScript": { + "scriptSource": "inline", + "scriptData": "[base64('conda activate azureml_py38;pip install feast[azure];pip install pymssql')]" + } + } + } + } + } + } + ] + }, + { + "name": "[parameters('registryBlobStore')]", + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-04-01", + "tags": { + "displayName": "Feast Registry Store" + }, + "location": "[resourceGroup().location]", + "kind": "StorageV2", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "allowBlobPublicAccess": false + }, + "resources": [ + { + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "name": "[concat('default/', 'fs-reg-container')]", + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ] + }, + { + "name": "[parameters('sqlServerName')]", + "type": "Microsoft.Sql/servers", + "apiVersion": "2014-04-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Offline Store Server" + }, + "properties": { + "administratorLogin": "[parameters('administratorLogin')]", + "administratorLoginPassword": "[parameters('administratorLoginPassword')]" + }, + "resources": [ + { + "type": "firewallRules", + "apiVersion": "2014-04-01", + "dependsOn": [ + "[resourceId('Microsoft.Sql/servers', concat(parameters('sqlServerName')))]" + ], + "location": "[resourceGroup().location]", + "name": "AllowAllWindowsAzureIps", + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } + }, + { + "name": "[parameters('sqlDbName')]", + "type": 
"databases", + "apiVersion": "2021-02-01-preview", + "location": "[resourceGroup().location]", + "sku": { + "tier": "Basic", + "name": "Basic" + }, + "tags": { + "displayName": "Feast Offline Store" + }, + "dependsOn": [ + "[resourceId('Microsoft.Sql/servers', concat(parameters('sqlServerName')))]" + ], + "properties": {} + } + ] + }, + { + "type": "Microsoft.Cache/redis", + "name": "[parameters('redisCacheName')]", + "apiVersion": "2020-12-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Online Store" + }, + "properties": { + "sku": { + "name": "Basic", + "family": "C", + "capacity": 2 + } + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-ONLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat(parameters('redisCacheName'),'.redis.cache.windows.net:6380,password=',listKeys(concat('Microsoft.Cache/redis/', parameters('redisCacheName')), providers('Microsoft.Cache', 'Redis').apiVersions[0]).primaryKey, ',ssl=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]", + "[variables('redisCache')]" + ] + } + ] + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-04-01-preview", + "name": "[variables('roleAssignmentName')]", + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', parameters('roleDefinitionId'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + }, + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ], + "outputs": {} +} \ No newline at end of file diff --git a/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json b/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json new file mode 100644 index 0000000000..476d332c56 --- /dev/null +++ b/docs/tutorials/azure/deployment/fs_synapse_azuredeploy.json @@ -0,0 +1,413 @@ +{ + "$schema": 
"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "administratorLoginPassword": { + "type": "securestring", + "metadata": { + "description": "The administrator password of the SQL logical server." + } + }, + "principalId": { + "type": "string", + "metadata": { + "description": "Specifies the principal ID assigned to the role. You can find in cloud shell using 'az ad signed-in-user show --query id -o tsv'" + } + }, + "sku": { + "type": "string", + "defaultValue": "DW100c", + "allowedValues": [ + "DW100c", + "DW200c", + "DW300c", + "DW400c", + "DW500c", + "DW1000c", + "DW1500c", + "DW2000c", + "DW2500c", + "DW3000c" + ], + "metadata": { + "description": "Select the SKU of the SQL pool." + } + }, + "allowAllConnections": { + "type": "string", + "allowedValues": [ + "true", + "false" + ], + "defaultValue": "true", + "metadata": { + "description": "Specifies whether to allow client IPs to connect to Synapse" + } + }, + "administratorLogin": { + "type": "string", + "metadata": { + "description": "The administrator username of the SQL logical server." + }, + "defaultValue": "azureuser" + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "description" + }, + "defaultValue": "Standard_DS3_v2" + }, + "roleDefinitionID": { + "type": "string", + "metadata": { + "description": "Specifies the role definition ID used in the role assignment. Defaults to Storage Blob Data Contributor." 
+ }, + "defaultValue": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" + } + }, + "functions": [], + "variables": { + "location": "[resourceGroup().location]", + "tenantId": "[subscription().tenantId]", + "registryBlobStore": "[concat('fsregistry',uniqueString(resourceGroup().id))]", + "redisCacheName": "[concat('fsonline',uniqueString(resourceGroup().id))]", + "amlWorkspaceName": "[concat('ml',uniqueString(resourceGroup().id))]", + "synapseName": "[concat('sy',uniqueString(resourceGroup().id))]", + "storageAccountName": "[concat('st', uniqueString(resourceGroup().id))]", + "keyVaultName": "[concat('kv-', uniqueString(resourceGroup().id))]", + "applicationInsightsName": "[concat('appi-', uniqueString(resourceGroup().id))]", + "containerRegistryName": "[concat('cr', uniqueString(resourceGroup().id))]", + "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "registryAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('registryBlobStore'))]", + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "applicationInsights": "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]", + "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', variables('containerRegistryName'))]", + "redisCache": "[resourceId('Microsoft.Cache/redis', variables('redisCacheName'))]", + "roleAssignmentName": "[guid(parameters('principalId'), parameters('roleDefinitionID'), resourceGroup().id)]", + "sqlPoolName": "[toLower(concat(variables('workspaceName'),'p1'))]", + "workspaceName": "[toLower(concat(variables('synapseName'),'ws1'))]", + "dlsName": "[toLower(concat('dls',variables('synapseName')))]", + "dlsFsName": "[toLower(concat(variables('dlsName'),'fs1'))]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-01-01", + "name": "[variables('storageAccountName')]", + "location": "[variables('location')]", + 
"sku": { + "name": "Standard_RAGRS" + }, + "kind": "StorageV2", + "properties": { + "encryption": { + "services": { + "blob": { + "enabled": true + }, + "file": { + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-04-01-preview", + "name": "[variables('keyVaultName')]", + "location": "[variables('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" + }, + "accessPolicies": [], + "enableSoftDelete": true + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-OFFLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('mssql+pyodbc://',parameters('administratorLogin'),':',parameters('administratorLoginPassword'),'@', variables('workspaceName'),'.database.windows.net:1433/', variables('sqlPoolName'), '?driver=ODBC+Driver+17+for+SQL+Server&autocommit=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-REGISTRY-PATH')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat('https://',variables('registryBlobStore'),'.blob.core.windows.net/fs-reg-container/registry.db')]" + }, + "dependsOn": [ + "[variables('keyVault')]" + ] + } + ] + }, + { + "type": "Microsoft.Insights/components", + "apiVersion": "2020-02-02", + "name": "[variables('applicationInsightsName')]", + "location": "[if(or(equals(variables('location'),'eastus2'), equals(variables('location'),'westcentralus')),'southcentralus',variables('location'))]", + "kind": "web", + "properties": { + "Application_Type": "web" + } + }, + { + "type": "Microsoft.ContainerRegistry/registries", + "sku": { + "name": 
"Standard", + "tier": "Standard" + }, + "name": "[variables('containerRegistryName')]", + "apiVersion": "2019-12-01-preview", + "location": "[variables('location')]", + "properties": { + "adminUserEnabled": true + } + }, + { + "type": "Microsoft.MachineLearningServices/workspaces", + "apiVersion": "2021-04-01", + "name": "[variables('amlWorkspaceName')]", + "location": "[resourceGroup().location]", + "identity": { + "type": "SystemAssigned" + }, + "tags": { + "displayName": "Azure ML Workspace" + }, + "dependsOn": [ + "[variables('storageAccount')]", + "[variables('keyVault')]", + "[variables('applicationInsights')]", + "[variables('containerRegistry')]" + ], + "properties": { + "storageAccount": "[variables('storageAccount')]", + "keyVault": "[variables('keyVault')]", + "applicationInsights": "[variables('applicationInsights')]", + "containerRegistry": "[variables('containerRegistry')]" + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "name": "[concat(variables('amlWorkspaceName'), '/', concat('ci-',uniqueString(resourceGroup().id)))]", + "apiVersion": "2021-07-01", + "dependsOn": [ + "[resourceId('Microsoft.MachineLearningServices/workspaces', concat(variables('amlWorkspaceName')))]" + ], + "location": "[variables('location')]", + "properties": { + "computeType": "ComputeInstance", + "properties": { + "vmSize": "[parameters('vmSize')]", + "setupScripts": { + "scripts": { + "creationScript": { + "scriptSource": "inline", + "scriptData": "[base64('conda activate azureml_py38;pip install feast[azure];pip install pymssql')]" + } + } + } + } + } + } + ] + }, + { + "name": "[variables('registryBlobStore')]", + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-04-01", + "tags": { + "displayName": "Feast Registry Store" + }, + "location": "[resourceGroup().location]", + "kind": "StorageV2", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "allowBlobPublicAccess": false + }, + 
"resources": [ + { + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "name": "[concat('default/', 'fs-reg-container')]", + "dependsOn": [ + "[variables('registryAccount')]" + ] + } + ] + }, + { + "type": "Microsoft.Cache/redis", + "name": "[variables('redisCacheName')]", + "apiVersion": "2020-12-01", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feast Online Store" + }, + "properties": { + "sku": { + "name": "Basic", + "family": "C", + "capacity": 2 + } + }, + "resources": [ + { + "type": "Microsoft.KeyVault/vaults/secrets", + "name": "[concat(variables('keyVaultName'), '/FEAST-ONLINE-STORE-CONN')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "properties": { + "value": "[concat(variables('redisCacheName'),'.redis.cache.windows.net:6380,password=',listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), providers('Microsoft.Cache', 'Redis').apiVersions[0]).primaryKey, ',ssl=True')]" + }, + "dependsOn": [ + "[variables('keyVault')]", + "[variables('redisCache')]" + ] + } + ] + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-04-01-preview", + "name": "[variables('roleAssignmentName')]", + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', parameters('roleDefinitionId'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + }, + "dependsOn": [ + "[variables('registryAccount')]" + ] + }, + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2019-06-01", + "name": "[variables('dlsName')]", + "location": "[variables('location')]", + "sku": { + "name": "Standard_LRS" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "supportsHttpsTrafficOnly": true, + "isHnsEnabled": true + }, + "resources": [ + { + "name": "[concat('default/', variables('dlsFsName'))]", + "type": "blobServices/containers", + "apiVersion": "2019-06-01", + "dependsOn": [ 
+ "[variables('dlsName')]" + ], + "properties": { + "publicAccess": "None" + } + } + ] + }, + { + "type": "Microsoft.Synapse/workspaces", + "apiVersion": "2019-06-01-preview", + "name": "[variables('workspaceName')]", + "location": "[variables('location')]", + "identity": { + "type": "SystemAssigned" + }, + "dependsOn": [ + "[variables('dlsName')]", + "[variables('dlsFsName')]" + ], + "properties": { + "defaultDataLakeStorage": { + "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]", + "filesystem": "[variables('dlsFsName')]" + }, + "sqlAdministratorLogin": "[parameters('administratorLogin')]", + "sqlAdministratorLoginPassword": "[parameters('administratorLoginPassword')]", + "managedVirtualNetwork": "default" + }, + "resources": [ + { + "condition": "[equals(parameters('allowAllConnections'),'true')]", + "type": "firewallrules", + "apiVersion": "2019-06-01-preview", + "name": "allowAll", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "255.255.255.255" + } + }, + { + "type": "firewallrules", + "apiVersion": "2019-06-01-preview", + "name": "AllowAllWindowsAzureIps", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } + }, + { + "type": "managedIdentitySqlControlSettings", + "apiVersion": "2019-06-01-preview", + "name": "default", + "location": "[variables('location')]", + "dependsOn": [ + "[variables('workspaceName')]" + ], + "properties": { + "grantSqlControlToManagedIdentity": { + "desiredState": "Enabled" + } + } + } + ] + }, + { + "type": "Microsoft.Synapse/workspaces/sqlPools", + "apiVersion": "2019-06-01-preview", + "name": "[concat(variables('workspaceName'), '/', variables('sqlPoolName'))]", + "location": "[variables('location')]", + "sku": { + "name": "[parameters('sku')]" + }, + "dependsOn": [ + 
"[variables('workspaceName')]" + ], + "properties": { + "createMode": "Default", + "collation": "SQL_Latin1_General_CP1_CI_AS" + } + } + ], + "outputs": {} +} \ No newline at end of file diff --git a/docs/tutorials/azure/media/arch.png b/docs/tutorials/azure/media/arch.png new file mode 100644 index 0000000000..c386c65f53 Binary files /dev/null and b/docs/tutorials/azure/media/arch.png differ diff --git a/docs/tutorials/azure/media/ci-kernel.png b/docs/tutorials/azure/media/ci-kernel.png new file mode 100644 index 0000000000..eeab1993b8 Binary files /dev/null and b/docs/tutorials/azure/media/ci-kernel.png differ diff --git a/docs/tutorials/azure/media/ci.png b/docs/tutorials/azure/media/ci.png new file mode 100644 index 0000000000..3b93391efc Binary files /dev/null and b/docs/tutorials/azure/media/ci.png differ diff --git a/docs/tutorials/azure/media/feast-overview.png b/docs/tutorials/azure/media/feast-overview.png new file mode 100644 index 0000000000..d8eb545143 Binary files /dev/null and b/docs/tutorials/azure/media/feast-overview.png differ diff --git a/docs/tutorials/azure/media/feast-tutorial-arch.png b/docs/tutorials/azure/media/feast-tutorial-arch.png new file mode 100644 index 0000000000..621df4dd2e Binary files /dev/null and b/docs/tutorials/azure/media/feast-tutorial-arch.png differ diff --git a/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml b/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..9cd55d2e9a --- /dev/null +++ b/docs/tutorials/azure/notebooks/feature_repo/feature_store.yaml @@ -0,0 +1,11 @@ +registry: + registry_store_type: AzureRegistryStore + path: ${REGISTRY_PATH} # Environment Variable +project: production +provider: azure +online_store: + type: redis + connection_string: ${REDIS_CONN} # Environment Variable +offline_store: + type: mssql + connection_string: ${SQL_CONN} # Environment Variable \ No newline at end of file diff --git 
a/docs/tutorials/azure/notebooks/part1-load-data.ipynb b/docs/tutorials/azure/notebooks/part1-load-data.ipynb new file mode 100644 index 0000000000..a6ab34bbaa --- /dev/null +++ b/docs/tutorials/azure/notebooks/part1-load-data.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load feature values into the Feature store\n", + "\n", + "The objective of this tutorial is to build a model that predicts if a driver will complete a trip based on a number of features ingested into Feast.\n", + "\n", + "This notebook creates you will create and load data into the following 3 feature tables.\n", + "\n", + "**Customer Profile**: This contains features related to a customer entity such as current balance, lifetime trip count, average number of passengers per trip. A snippet of data:\n", + "\n", + "| datetime | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created |\n", + "|------------------|-------------|-----------------|---------------------|---------------------|------------------|\n", + "| 2021-03-17 19:31 | 1010 | 0.889188 | 0.049057 | 412 | 2021-03-24 19:38 |\n", + "| 2021-03-18 19:31 | 1010 | 0.979273 | 0.212630 | 639 | 2021-03-24 19:38 |\n", + "| 2021-03-19 19:31 | 1010 | 0.976549 | 0.176881 | 70 | 2021-03-24 19:38 |\n", + "| 2021-03-20 19:31 | 1010 | 0.273697 | 0.325012 | 68 | 2021-03-24 19:38 |\n", + "\n", + "**Driver table**: This contains features related to a driver entity such as conversion rate, average number of daily trips. 
A snippet of data:\n", + "\n", + "| datetime | driver_id | conv_rate | acc_rate | avg_daily_trips | created |\n", + "|------------------|-----------|-----------|----------|-----------------|------------------|\n", + "| 2021-03-17 19:31 | 5010 | 0.229297 | 0.685843 | 861 | 2021-03-24 19:34 |\n", + "| 2021-03-17 20:31 | 5010 | 0.781655 | 0.861280 | 769 | 2021-03-24 19:34 |\n", + "| 2021-03-17 21:31 | 5010 | 0.150333 | 0.525581 | 778 | 2021-03-24 19:34 |\n", + "| 2021-03-17 22:31 | 5010 | 0.951701 | 0.228883 | 570 | 2021-03-24 19:34 |\n", + "\n", + "\n", + "**Orders table**: This is a typical *fact table* that contains the order information such driver/customer id and whether the trip was completed. A snippet of data:\n", + "\n", + "| order_id | driver_id | customer_id | order_is_success | event_timestamp |\n", + "|----------|-----------|-------------|------------------|---------------------|\n", + "| 100 | 5004 | 1007 | 0 | 2021-03-10 19:31:15 |\n", + "| 101 | 5003 | 1006 | 0 | 2021-03-11 22:02:50 |\n", + "| 102 | 5010 | 1005 | 0 | 2021-03-13 00:34:24 |\n", + "| 103 | 5010 | 1001 | 1 | 2021-03-14 03:05:59 |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "from sqlalchemy.sql import text\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "\n", + "engine = create_engine(kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Create Customer profile table and load data\n", + "The cell below will create the customer profile table and load the data into the Synapse table. 
Loading is achieved using the `COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.customer_profile\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/customer_profile.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " # create and load customer profile table\n", + " file = open(\"../sql/create_cx_profile_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating customer profile table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_cx_profile_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading customer profile data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.customer_profile\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create drivers table and load data\n", + "The cell below will create the drivers table and load the data into the Synapse table. 
Loading is achieved using the `COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.driver_hourly\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/driver_hourly.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " file = open(\"../sql/create_drivers_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating drivers table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_drivers_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading drivers data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.driver_hourly\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create orders table and load data\n", + "The cell below will create the orders table and load the data into the Synapse table. 
Loading is achieved using the `COPY INTO` bulk load available in Synapse (the CSV data is available on public blob):\n", + "\n", + "```sql\n", + "COPY INTO dbo.orders\n", + "FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/orders.csv'\n", + "WITH\n", + "(\n", + "\tFILE_TYPE = 'CSV'\n", + "\t,FIRSTROW = 2\n", + "\t,MAXERRORS = 0\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " file = open(\"../sql/create_orders_table.sql\")\n", + " query = text(file.read())\n", + " print(\"creating orders table...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + " file = open(\"../sql/load_orders_data.sql\")\n", + " query = text(file.read())\n", + " print(\"loading orders data...\", end=\"\")\n", + " con.execute(query)\n", + " print(\"done\")\n", + "\n", + "pd.read_sql_query(\"select top 10 * from dbo.orders\", engine)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "With the feature values loaded into the feature store, you will need to register the features in the feast central registry. [Follow the Register Features part of the tutorial](./part2-register-features.ipynb). 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/part2-register-features.ipynb b/docs/tutorials/azure/notebooks/part2-register-features.ipynb new file mode 100644 index 0000000000..6ec87577cf --- /dev/null +++ b/docs/tutorials/azure/notebooks/part2-register-features.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "Licensed under the MIT license.\n", + "\n", + "# Feast Azure Provider Tutorial: Register Features\n", + "\n", + "In this notebook you will connect to your feature store and register features into a central repository hosted on Azure Blob Storage. It should be noted that best practice for registering features would be through a CI/CD process e.g. GitHub Actions, or Azure DevOps.\n", + "\n", + "## What is Feast?\n", + "\n", + "Feast is an operational data system for managing and serving machine learning features to models in production. 
Feast is able to serve feature data to models from a low-latency online store (for real-time prediction) or from an offline store (for scale-out batch scoring or model training).\n", + "\n", + "![feast overview](../media/feast-overview.png)\n", + "\n", + "## Configure Feature Repository\n", + "\n", + "The cell below displays the feature_store.yaml file - a file that contains infrastructural configuration, such as where the registry file is located, and connection strings to data.\n", + "\n", + "__There is no need to change the details in this file. When you connect to the feature store afterwards, the credentials are resolved from the Azure ML default keyvault.__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat feature_repo/feature_store.yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to the feature store\n", + "\n", + "Below you connect to the feature store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from feast import FeatureStore\n", + "from azureml.core import Workspace\n", + "\n", + "# access key vault to get secrets\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "os.environ['REGISTRY_PATH']=kv.get_secret(\"FEAST-REGISTRY-PATH\")\n", + "os.environ['SQL_CONN']=kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\")\n", + "os.environ['REDIS_CONN']=kv.get_secret(\"FEAST-ONLINE-STORE-CONN\")\n", + "\n", + "# connect to feature store\n", + "fs = FeatureStore(\"./feature_repo\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the data source (offline store)\n", + "\n", + "The data source refers to raw underlying data (a table in Azure SQL DB or Synapse SQL). Feast uses a time-series data model to represent data. 
This data model is used to interpret feature data in data sources in order to build training datasets or when materializing features into an online store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import MsSqlServerSource\n", + "\n", + "orders_table = \"orders\"\n", + "driver_hourly_table = \"driver_hourly\"\n", + "customer_profile_table = \"customer_profile\"\n", + "\n", + "driver_source = MsSqlServerSource(\n", + " table_ref=driver_hourly_table,\n", + " event_timestamp_column=\"datetime\",\n", + " created_timestamp_column=\"created\",\n", + ")\n", + "\n", + "customer_source = MsSqlServerSource(\n", + " table_ref=customer_profile_table,\n", + " event_timestamp_column=\"datetime\",\n", + " created_timestamp_column=\"\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Feature Views\n", + "\n", + "A feature view is an object that represents a logical group of time-series feature data as it is found in a data source. Feature views consist of one or more entities, features, and a data source. Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment.\n", + "\n", + "Feature views are used during:\n", + "\n", + "- The generation of training datasets by querying the data source of feature views in order to find historical feature values. A single training dataset may consist of features from multiple feature views. \n", + "- Loading of feature values into an online store. Feature views determine the storage schema in the online store.\n", + "- Retrieval of features from the online store. Feature views provide the schema definition to Feast in order to look up features from the online store.\n", + "\n", + "__NOTE: Feast does not generate feature values. 
It acts as the ingestion and serving system. The data sources described within feature views should reference feature values in their already computed form.__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast import Feature, FeatureView, ValueType\n", + "from datetime import timedelta\n", + "\n", + "driver_fv = FeatureView(\n", + " name=\"driver_stats\",\n", + " entities=[\"driver\"],\n", + " features=[\n", + " Feature(name=\"conv_rate\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"acc_rate\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"avg_daily_trips\", dtype=ValueType.INT32),\n", + " ],\n", + " batch_source=driver_source,\n", + " ttl=timedelta(hours=2),\n", + ")\n", + "\n", + "customer_fv = FeatureView(\n", + " name=\"customer_profile\",\n", + " entities=[\"customer_id\"],\n", + " features=[\n", + " Feature(name=\"current_balance\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"avg_passenger_count\", dtype=ValueType.FLOAT),\n", + " Feature(name=\"lifetime_trip_count\", dtype=ValueType.INT32),\n", + " ],\n", + " batch_source=customer_source,\n", + " ttl=timedelta(days=2),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define entities\n", + "\n", + "An entity is a collection of semantically related features. Users define entities to map to the domain of their use case. For example, a ride-hailing service could have customers and drivers as their entities, which group related features that correspond to these customers and drivers.\n", + "\n", + "Entities are defined as part of feature views. Entities are used to identify the primary key on which feature values should be stored and retrieved. These keys are used during the lookup of feature values from the online store and the join process in point-in-time joins. 
It is possible to define composite entities (more than one entity object) in a feature view.\n", + "Entities should be reused across feature views.\n", + "\n", + "## Entity key\n", + "\n", + "A related concept is an entity key. These are one or more entity values that uniquely describe a feature view record. In the case of an entity (like a driver) that only has a single entity field, the entity is an entity key. However, it is also possible for an entity key to consist of multiple entity values. For example, a feature view with the composite entity of (customer, country) might have an entity key of (1001, 5).\n", + "\n", + "Entity keys act as primary keys. They are used during the lookup of features from the online store, and they are also used to match feature rows across feature views during point-in-time joins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast import Entity\n", + "driver = Entity(name=\"driver\", join_key=\"driver_id\", value_type=ValueType.INT64)\n", + "customer = Entity(name=\"customer_id\", value_type=ValueType.INT64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feast `apply()`\n", + "\n", + "Feast `apply` will:\n", + "\n", + "1. Feast will scan Python files in your feature repository and find all Feast object definitions, such as feature views, entities, and data sources.\n", + "1. Feast will validate your feature definitions\n", + "1. Feast will sync the metadata about Feast objects to the registry. If a registry does not exist, then it will be instantiated. The standard registry is a simple protobuf binary file that is stored on Azure Blob Storage.\n", + "1. Feast CLI will create all necessary feature store infrastructure. The exact infrastructure that is deployed or configured depends on the provider configuration that you have set in feature_store.yaml." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.apply([driver, driver_fv, customer, customer_fv])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What just happened?\n", + "\n", + "If you look in your feast registry storage account, you will see there is now a registry.db file that contains the metadata for your registered features. Below you can list the feature views:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from google.protobuf.json_format import MessageToDict\n", + "\n", + "for x in fs.list_feature_views():\n", + " d=MessageToDict(x.to_proto())\n", + " print(\"🪟 Feature view name:\", d['spec']['name'])\n", + " print(\"🧑 Entities:\", d['spec']['entities'])\n", + " print(\"🧪 Features:\", d['spec']['features'])\n", + " print(\"💾 Batch source type:\", d['spec']['batchSource']['dataSourceClassType'])\n", + " print(\"\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "In the [next part of this tutorial](./part3-train-and-deploy-with-feast.ipynb) you will:\n", + "\n", + "- Train a model using features stored in your feature store\n", + "- Materialize the data from the offline store to the online store\n", + "- Deploy the model to a real-time endpoint, that consumes feature vectors from the online store.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": 
"b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb b/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb new file mode 100644 index 0000000000..ff15aac60d --- /dev/null +++ b/docs/tutorials/azure/notebooks/part3-train-and-deploy-with-feast.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. Licensed under the MIT license.\n", + "\n", + "# Train and Deploy a model using Feast\n", + "\n", + "In this notebook we show how to:\n", + "\n", + "1. access a feature store \n", + "1. discover features in the feature store\n", + "1. train a model using the offline store (using the feast function `get_historical_features()`)\n", + "1. use the feast `materialize()` function to push features from the offline store to an online store (redis)\n", + "1. Deploy the model to an Azure ML endpoint where the features are consumed from the online store (feast function `get_online_features()`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to Feature store\n", + "\n", + "Below we create a Feast repository config, which accesses the registry.db file and also provides the credentials to the offline and online storage. These credentials are done via the Azure Keyvault." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "gather": { + "logged": 1627130565121 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import os\n", + "from feast import FeatureStore\n", + "from azureml.core import Workspace\n", + "\n", + "# access key vault to get secrets\n", + "ws = Workspace.from_config()\n", + "kv = ws.get_default_keyvault()\n", + "os.environ['REGISTRY_PATH']=kv.get_secret(\"FEAST-REGISTRY-PATH\")\n", + "os.environ['SQL_CONN']=kv.get_secret(\"FEAST-OFFLINE-STORE-CONN\")\n", + "os.environ['REDIS_CONN']=kv.get_secret(\"FEAST-ONLINE-STORE-CONN\")\n", + "\n", + "# connect to feature store\n", + "fs = FeatureStore(\"./feature_repo\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List the feature views\n", + "\n", + "Below lists the registered feature views." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_feature_views()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "gather": { + "logged": 1627130724228 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load features into a pandas dataframe\n", + "\n", + "Below you load the features from the feature store into a pandas data frame." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "gather": { + "logged": 1626933777036 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "sql_job = fs.get_historical_features(\n", + " entity_df=\"SELECT * FROM orders\",\n", + " features=[\n", + " \"driver_stats:conv_rate\",\n", + " \"driver_stats:acc_rate\",\n", + " \"driver_stats:avg_daily_trips\",\n", + " \"customer_profile:current_balance\",\n", + " \"customer_profile:avg_passenger_count\",\n", + " \"customer_profile:lifetime_trip_count\",\n", + " ],\n", + ")\n", + "\n", + "training_df = sql_job.to_df()\n", + "training_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Train a model and capture metrics with MLFlow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from azureml.core import Workspace\n", + "\n", + "# connect to your workspace\n", + "ws = Workspace.from_config()\n", + "\n", + "# create experiment and start logging to a new run in the experiment\n", + "experiment_name = \"order_model\"\n", + "\n", + "# set up MLflow to track the metrics\n", + "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())\n", + "mlflow.set_experiment(experiment_name)\n", + "mlflow.sklearn.autolog()\n", + "\n", + "training_df = training_df.dropna()\n", + "X = training_df[['conv_rate', 'acc_rate', 'avg_daily_trips', \n", + " 'current_balance', 'avg_passenger_count','lifetime_trip_count' ]].dropna()\n", + "y = training_df['order_is_success']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.33, random_state=42)\n", + "clf = RandomForestClassifier(n_estimators=10)\n", + "\n", + "# train the model\n", + "with mlflow.start_run() as run:\n", + " clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare for deployment\n", + "\n", + "### Register model and the feature registry " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# register the model\n", + "model_uri = \"runs:/{}/model\".format(run.info.run_id)\n", + "model = mlflow.register_model(model_uri, \"order_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `materialize()` data into the online store (redis)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "end_date = datetime.now()\n", + "start_date = end_date - timedelta(days=365)\n", + "fs.materialize(start_date=start_date, end_date=end_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up deployment configuration\n", + "\n", + "__Note: You will need to set up a service principal (SP) and add that SP to your blob storage account as a *Storage Blob Data Contributor* role to authenticate to the storage containing the feast registry file.__\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`az ad sp create-for-rbac -n $sp_name --role \"Storage Blob Data Contributor\" \\\n", + "--scopes /subscriptions/$sub_id/resourceGroups/$rg_name`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have set up the SP, populate the `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET` environment variables below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.environment import Environment\n", + "from azureml.core.webservice import AciWebservice\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "keyvault = ws.get_default_keyvault()\n", + "\n", + "# create deployment config i.e. compute resources\n", + "aciconfig = AciWebservice.deploy_configuration(\n", + " cpu_cores=1,\n", + " memory_gb=1,\n", + " description=\"orders service using feast\",\n", + ")\n", + "\n", + "# get registered environment\n", + "env = Environment(\"feast-env\")\n", + "env.docker.base_image = None\n", + "env.docker.base_dockerfile = \"./inference.dockerfile\"\n", + "env.python.user_managed_dependencies = True\n", + "env.inferencing_stack_version = 'latest'\n", + "env.python.interpreter_path = \"/azureml-envs/feast/bin/python\"\n", + "\n", + "# again ensure that the scoring environment has access to the registry file\n", + "env.environment_variables = {\n", + " \"FEAST_SQL_CONN\": fs.config.offline_store.connection_string,\n", + " \"FEAST_REDIS_CONN\": fs.config.online_store.connection_string,\n", + " \"FEAST_REGISTRY_BLOB\": fs.config.registry.path,\n", + " \"AZURE_CLIENT_ID\": \"PROVIDE YOUR SERVICE PRINCIPLE CLIENT ID HERE\",\n", + " \"AZURE_TENANT_ID\": \"PROVIDE YOUR SERVICE PRINCIPLE TENANT ID HERE\",\n", + " \"AZURE_CLIENT_SECRET\": \"PROVIDE YOUR SERVICE PRINCIPLE CLIENT SECRET HERE\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy model\n", + "\n", + "Next, you deploy the model to Azure Container Instance. Please note that this may take approximately 10 minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "from azureml.core.model import InferenceConfig\n", + "from azureml.core.environment import Environment\n", + "from azureml.core.model import Model\n", + "\n", + "# get the registered model\n", + "model = Model(ws, \"order_model\")\n", + "\n", + "# create an inference config i.e. the scoring script and environment\n", + "inference_config = InferenceConfig(\n", + " entry_script=\"./src/score.py\", \n", + " environment=env, \n", + " source_directory=\"src\"\n", + ")\n", + "\n", + "# deploy the service\n", + "service_name = \"orders-service\" + str(uuid.uuid4())[:4]\n", + "service = Model.deploy(\n", + " workspace=ws,\n", + " name=service_name,\n", + " models=[model],\n", + " inference_config=inference_config,\n", + " deployment_config=aciconfig,\n", + ")\n", + "\n", + "service.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test service\n", + "\n", + "Below you test the service. The first score takes a while as the feast registry file is downloaded from blob. Subsequent runs will be faster as feast uses a local cache for the registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "input_payload = json.dumps({\"driver\":50521, \"customer_id\":20265})\n", + "\n", + "service.run(input_data=input_payload)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "service.delete()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "newenv" + }, + "kernelspec": { + "display_name": "Python 3.8.12 64-bit ('feast_env')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "vscode": { + "interpreter": { + "hash": "b81e56dd72a0de84f7bcdac7bc848ecf5d1ed9826cc75d6e0cb7b6dbe5b95a6d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/azure/notebooks/src/score.py b/docs/tutorials/azure/notebooks/src/score.py new file mode 100644 index 0000000000..93b248240d --- /dev/null +++ b/docs/tutorials/azure/notebooks/src/score.py @@ -0,0 +1,76 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import logging +import json +import joblib +from feast import FeatureStore, RepoConfig +from feast.infra.registry.registry import RegistryConfig + +from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import MsSqlServerOfflineStoreConfig +from feast.infra.online_stores.redis import RedisOnlineStoreConfig, RedisOnlineStore + + +def init(): + sql_conn_str = os.getenv("FEAST_SQL_CONN") + redis_conn_str = os.getenv("FEAST_REDIS_CONN") + feast_registry_path = os.getenv("FEAST_REGISTRY_BLOB") + + print("connecting to registry...") + reg_config = RegistryConfig( + registry_store_type="azure", + path=feast_registry_path, + ) + + print("connecting to repo config...") + repo_cfg = RepoConfig( + project="production", + provider="azure", + registry=reg_config, + offline_store=MsSqlServerOfflineStoreConfig(connection_string=sql_conn_str), + online_store=RedisOnlineStoreConfig(connection_string=redis_conn_str), + ) + global store + print("connecting to feature store...") + store = FeatureStore(config=repo_cfg) + + global model + # AZUREML_MODEL_DIR is an environment variable created during deployment. 
+ # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) + model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "model/model.pkl") + # deserialize the model file back into a sklearn model + model = joblib.load(model_path) + print("read model, init complete") + + +def run(raw_data): + data = json.loads(raw_data) + feature_vector = store.get_online_features( + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips", + "driver_stats:acc_rate", + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + ], + entity_rows=[data], + ).to_df() + logging.info(feature_vector) + if len(feature_vector.dropna()) > 0: + data = feature_vector[ + [ + "conv_rate", + "avg_daily_trips", + "acc_rate", + "current_balance", + "avg_passenger_count", + "lifetime_trip_count", + ] + ] + + y_hat = model.predict(data) + return y_hat.tolist() + else: + return 0.0 \ No newline at end of file diff --git a/docs/tutorials/azure/sql/create_cx_profile_table.sql b/docs/tutorials/azure/sql/create_cx_profile_table.sql new file mode 100644 index 0000000000..c1cd09c9f3 --- /dev/null +++ b/docs/tutorials/azure/sql/create_cx_profile_table.sql @@ -0,0 +1,14 @@ +CREATE TABLE dbo.customer_profile +( + [datetime] DATETIME2(0), + [customer_id] bigint, + [current_balance] float, + [lifetime_trip_count] bigint, + [avg_passenger_count] float, + [created] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/create_drivers_table.sql b/docs/tutorials/azure/sql/create_drivers_table.sql new file mode 100644 index 0000000000..39b4b1371d --- /dev/null +++ b/docs/tutorials/azure/sql/create_drivers_table.sql @@ -0,0 +1,14 @@ +CREATE TABLE dbo.driver_hourly +( + [datetime] DATETIME2(0), + [driver_id] bigint, + [avg_daily_trips] float, + [conv_rate] float, + [acc_rate] float, + [created] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + 
CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/create_orders_table.sql b/docs/tutorials/azure/sql/create_orders_table.sql new file mode 100644 index 0000000000..e2325e85f6 --- /dev/null +++ b/docs/tutorials/azure/sql/create_orders_table.sql @@ -0,0 +1,13 @@ +CREATE TABLE dbo.orders +( + [order_id] bigint, + [driver_id] bigint, + [customer_id] bigint, + [order_is_success] int, + [event_timestamp] datetime2(3) +) +WITH +( +DISTRIBUTION = ROUND_ROBIN, + CLUSTERED COLUMNSTORE INDEX +) diff --git a/docs/tutorials/azure/sql/load_cx_profile_data.sql b/docs/tutorials/azure/sql/load_cx_profile_data.sql new file mode 100644 index 0000000000..c3f55f4d72 --- /dev/null +++ b/docs/tutorials/azure/sql/load_cx_profile_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.customer_profile +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/customer_profile.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/docs/tutorials/azure/sql/load_drivers_data.sql b/docs/tutorials/azure/sql/load_drivers_data.sql new file mode 100644 index 0000000000..37aa357b9d --- /dev/null +++ b/docs/tutorials/azure/sql/load_drivers_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.driver_hourly +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/driver_hourly.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/docs/tutorials/azure/sql/load_orders_data.sql b/docs/tutorials/azure/sql/load_orders_data.sql new file mode 100644 index 0000000000..eaa062eac2 --- /dev/null +++ b/docs/tutorials/azure/sql/load_orders_data.sql @@ -0,0 +1,8 @@ +COPY INTO dbo.orders +FROM 'https://feastonazuredatasamples.blob.core.windows.net/feastdatasamples/orders.csv' +WITH +( + FILE_TYPE = 'CSV' + ,FIRSTROW = 2 + ,MAXERRORS = 0 +) diff --git a/docs/tutorials/driver-ranking-with-feast.md b/docs/tutorials/driver-ranking-with-feast.md deleted file mode 100644 index 4ad34cd9c0..0000000000 --- 
a/docs/tutorials/driver-ranking-with-feast.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -description: >- - Making a prediction using a linear regression model is a common use case in - ML. This model predicts if a driver will complete a trip based on features - ingested into Feast. ---- - -# Driver ranking - -In this example, you'll learn how to use some of the key functionality in Feast. The tutorial runs in both local mode and on the Google Cloud Platform \(GCP\). For GCP, you must have access to a GCP project already, including read and write permissions to BigQuery. - -## [Driver Ranking Example](https://github.com/feast-dev/feast-driver-ranking-tutorial) - -This tutorial guides you on how to use Feast with [Scikit-learn](https://scikit-learn.org/stable/). You will learn how to: - -* Train a model locally \(on your laptop\) using data from [BigQuery](https://cloud.google.com/bigquery/) -* Test the model for online inference using [SQLite](https://www.sqlite.org/index.html) \(for fast iteration\) -* Test the model for online inference using [Firestore](https://firebase.google.com/products/firestore) \(for production use\) - -Try it and let us know what you think! - -| ![](../.gitbook/assets/colab_logo_32px.png)[ Run in Google Colab ](https://colab.research.google.com/github/feast-dev/feast-driver-ranking-tutorial/blob/master/notebooks/Driver_Ranking_Tutorial.ipynb) | ![](../.gitbook/assets/github-mark-32px.png)[ View Source in Github](https://github.com/feast-dev/feast-driver-ranking-tutorial/blob/master/notebooks/Driver_Ranking_Tutorial.ipynb) | -| :--- | :--- | - - diff --git a/docs/tutorials/tutorials-overview.md b/docs/tutorials/tutorials-overview.md deleted file mode 100644 index 9432783a69..0000000000 --- a/docs/tutorials/tutorials-overview.md +++ /dev/null @@ -1,15 +0,0 @@ -# Overview - -These Feast tutorials showcase how to use Feast to simplify end to end model training / serving. 
- -{% page-ref page="fraud-detection.md" %} - -{% page-ref page="driver-ranking-with-feast.md" %} - -{% page-ref page="real-time-credit-scoring-on-aws.md" %} - -{% page-ref page="driver-stats-on-snowflake.md" %} - -{% page-ref page="validating-historical-features.md" %} - -{% page-ref page="using-scalable-registry.md" %} diff --git a/docs/tutorials/tutorials-overview/README.md b/docs/tutorials/tutorials-overview/README.md new file mode 100644 index 0000000000..76cb2bea6b --- /dev/null +++ b/docs/tutorials/tutorials-overview/README.md @@ -0,0 +1,19 @@ +# Sample use-case tutorials + +These Feast tutorials showcase how to use Feast to simplify end to end model training / serving. + +{% content-ref url="driver-ranking-with-feast.md" %} +[driver-ranking-with-feast.md](driver-ranking-with-feast.md) +{% endcontent-ref %} + +{% content-ref url="fraud-detection.md" %} +[fraud-detection.md](fraud-detection.md) +{% endcontent-ref %} + +{% content-ref url="real-time-credit-scoring-on-aws.md" %} +[real-time-credit-scoring-on-aws.md](real-time-credit-scoring-on-aws.md) +{% endcontent-ref %} + +{% content-ref url="driver-stats-on-snowflake.md" %} +[driver-stats-on-snowflake.md](driver-stats-on-snowflake.md) +{% endcontent-ref %} diff --git a/docs/tutorials/tutorials-overview/driver-ranking-with-feast.md b/docs/tutorials/tutorials-overview/driver-ranking-with-feast.md new file mode 100644 index 0000000000..54f3035319 --- /dev/null +++ b/docs/tutorials/tutorials-overview/driver-ranking-with-feast.md @@ -0,0 +1,23 @@ +--- +description: >- + Making a prediction using a linear regression model is a common use case in + ML. This model predicts if a driver will complete a trip based on features + ingested into Feast. +--- + +# Driver ranking + +In this example, you'll learn how to use some of the key functionality in Feast. The tutorial runs in both local mode and on the Google Cloud Platform (GCP). 
For GCP, you must have access to a GCP project already, including read and write permissions to BigQuery. + +## [Driver Ranking Example](https://github.com/feast-dev/feast-driver-ranking-tutorial) + +This tutorial guides you on how to use Feast with [Scikit-learn](https://scikit-learn.org/stable/). You will learn how to: + +* Train a model locally (on your laptop) using data from [BigQuery](https://cloud.google.com/bigquery/) +* Test the model for online inference using [SQLite](https://www.sqlite.org/index.html) (for fast iteration) +* Test the model for online inference using [Firestore](https://firebase.google.com/products/firestore) (for production use) + +Try it and let us know what you think! + +| ![](../../.gitbook/assets/colab\_logo\_32px.png)[ Run in Google Colab](https://colab.research.google.com/github/feast-dev/feast-driver-ranking-tutorial/blob/master/notebooks/Driver\_Ranking\_Tutorial.ipynb) | ![](../../.gitbook/assets/github-mark-32px.png)[ View Source in Github](https://github.com/feast-dev/feast-driver-ranking-tutorial/blob/master/notebooks/Driver\_Ranking\_Tutorial.ipynb) | +| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/docs/tutorials/driver-stats-on-snowflake.md b/docs/tutorials/tutorials-overview/driver-stats-on-snowflake.md similarity index 88% rename from docs/tutorials/driver-stats-on-snowflake.md rename to docs/tutorials/tutorials-overview/driver-stats-on-snowflake.md index 306ae2f59b..a425248b76 100644 --- a/docs/tutorials/driver-stats-on-snowflake.md +++ b/docs/tutorials/tutorials-overview/driver-stats-on-snowflake.md @@ -6,7 +6,7 @@ description: >- # Drivers stats on Snowflake 
In the steps below, we will set up a sample Feast project that leverages Snowflake -as an offline store. +as an offline store + materialization engine + online store. Starting with data in a Snowflake table, we will register that table to the feature store and define features associated with the columns in that table. From there, we will generate historical training data based on those feature definitions and then materialize the latest feature values into the online store. Lastly, we will retrieve the materialized feature values. @@ -46,7 +46,7 @@ The following files will automatically be created in your project folder: #### Inspect `feature_store.yaml` -Here you will see the information that you entered. This template will use Snowflake as an offline store and SQLite as the online store. The main thing to remember is by default, Snowflake objects have ALL CAPS names unless lower case was specified. +Here you will see the information that you entered. This template will use Snowflake as the offline store, materialization engine, and the online store. The main thing to remember is by default, Snowflake objects have ALL CAPS names unless lower case was specified. 
{% code title="feature_store.yaml" %} ```yaml @@ -61,6 +61,14 @@ offline_store: role: ROLE_NAME #case sensitive warehouse: WAREHOUSE_NAME #case sensitive database: DATABASE_NAME #case cap sensitive +batch_engine: + type: snowflake.engine + account: SNOWFLAKE_DEPLOYMENT_URL #drop .snowflakecomputing.com + user: USERNAME + password: PASSWORD + role: ROLE_NAME #case sensitive + warehouse: WAREHOUSE_NAME #case sensitive + database: DATABASE_NAME #case cap sensitive online_store: type: snowflake.online account: SNOWFLAKE_DEPLOYMENT_URL #drop .snowflakecomputing.com diff --git a/docs/tutorials/fraud-detection.md b/docs/tutorials/tutorials-overview/fraud-detection.md similarity index 51% rename from docs/tutorials/fraud-detection.md rename to docs/tutorials/tutorials-overview/fraud-detection.md index 7bdfde760e..30564d0b0c 100644 --- a/docs/tutorials/fraud-detection.md +++ b/docs/tutorials/tutorials-overview/fraud-detection.md @@ -17,13 +17,9 @@ Our end-to-end example will perform the following workflows: * Building point-in-time correct training datasets from feature data and training a model * Making online predictions from feature data -Here's a high-level picture of our system architecture on Google Cloud Platform \(GCP\): - - - -![](../.gitbook/assets/data-systems-fraud-2x.jpg) - -| ![](../.gitbook/assets/colab_logo_32px.png) [Run in Google Colab](https://colab.research.google.com/github/feast-dev/feast-fraud-tutorial/blob/master/notebooks/Fraud_Detection_Tutorial.ipynb) | ![](../.gitbook/assets/github-mark-32px.png)[ View Source on Github](https://github.com/feast-dev/feast-fraud-tutorial/blob/main/notebooks/Fraud_Detection_Tutorial.ipynb) | -| :--- | :--- | +Here's a high-level picture of our system architecture on Google Cloud Platform (GCP): +![](../../.gitbook/assets/data-systems-fraud-2x.jpg) +| ![](../../.gitbook/assets/colab\_logo\_32px.png) [Run in Google 
Colab](https://colab.research.google.com/github/feast-dev/feast-fraud-tutorial/blob/master/notebooks/Fraud\_Detection\_Tutorial.ipynb) | ![](../../.gitbook/assets/github-mark-32px.png)[ View Source on Github](https://github.com/feast-dev/feast-fraud-tutorial/blob/main/notebooks/Fraud\_Detection\_Tutorial.ipynb) | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/docs/tutorials/real-time-credit-scoring-on-aws.md b/docs/tutorials/tutorials-overview/real-time-credit-scoring-on-aws.md similarity index 74% rename from docs/tutorials/real-time-credit-scoring-on-aws.md rename to docs/tutorials/tutorials-overview/real-time-credit-scoring-on-aws.md index 43f8c98133..6268aba1f1 100644 --- a/docs/tutorials/real-time-credit-scoring-on-aws.md +++ b/docs/tutorials/tutorials-overview/real-time-credit-scoring-on-aws.md @@ -10,20 +10,18 @@ When individuals apply for loans from banks and other credit providers, the deci In this example, we will demonstrate how a real-time credit scoring system can be built using Feast and Scikit-Learn on AWS, using feature data from S3. -This real-time system accepts a loan request from a customer and responds within 100ms with a decision on whether their loan has been approved or rejected. +This real-time system accepts a loan request from a customer and responds within 100ms with a decision on whether their loan has been approved or rejected. 
## [Real-time Credit Scoring Example](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial) This end-to-end tutorial will take you through the following steps: -* Deploying S3 with Parquet as your primary data source, containing both [loan features](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial/blob/22fc6c7272ef033e7ba0afc64ffaa6f6f8fc0277/data/loan_table_sample.csv) and [zip code features](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial/blob/22fc6c7272ef033e7ba0afc64ffaa6f6f8fc0277/data/zipcode_table_sample.csv) +* Deploying S3 with Parquet as your primary data source, containing both [loan features](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial/blob/22fc6c7272ef033e7ba0afc64ffaa6f6f8fc0277/data/loan\_table\_sample.csv) and [zip code features](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial/blob/22fc6c7272ef033e7ba0afc64ffaa6f6f8fc0277/data/zipcode\_table\_sample.csv) * Deploying Redshift as the interface Feast uses to build training datasets * Registering your features with Feast and configuring DynamoDB for online serving * Building a training dataset with Feast to train your credit scoring model * Loading feature values from S3 into DynamoDB * Making online predictions with your credit scoring model using features from DynamoDB -| ![](../.gitbook/assets/github-mark-32px.png)[ View Source on Github](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial) | -| :--- | - - +| ![](../../.gitbook/assets/github-mark-32px.png)[ View Source on Github](https://github.com/feast-dev/real-time-credit-scoring-on-aws-tutorial) | +| ---------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/docs/tutorials/using-scalable-registry.md b/docs/tutorials/using-scalable-registry.md index 51fa50ff33..0ee02674b1 100644 --- a/docs/tutorials/using-scalable-registry.md 
+++ b/docs/tutorials/using-scalable-registry.md @@ -13,6 +13,11 @@ However, there's inherent limitations with a file-based registry, since changing An alternative to the file-based registry is the [SQLRegistry](https://rtd.feast.dev/en/latest/feast.infra.registry_stores.html#feast.infra.registry_stores.sql.SqlRegistry) which ships with Feast. This implementation stores the registry in a relational database, and allows for changes to individual objects atomically. Under the hood, the SQL Registry implementation uses [SQLAlchemy](https://docs.sqlalchemy.org/en/14/) to abstract over the different databases. Consequently, any [database supported](https://docs.sqlalchemy.org/en/14/core/engines.html#supported-databases) by SQLAlchemy can be used by the SQL Registry. +The following databases are supported and tested out of the box: +- PostgreSQL +- MySQL +- Sqlite + Feast can use the SQL Registry via a config change in the feature_store.yaml file. An example of how to configure this would be: ```yaml diff --git a/examples/java-demo/README.md b/examples/java-demo/README.md index b908bb7625..0ae085e0a7 100644 --- a/examples/java-demo/README.md +++ b/examples/java-demo/README.md @@ -30,21 +30,21 @@ For this tutorial, we setup Feast with Redis, using the Feast CLI to register an 2. Make a bucket in GCS (or S3) 3. The feature repo is already setup here, so you just need to swap in your GCS bucket and Redis credentials. We need to modify the `feature_store.yaml`, which has two fields for you to replace: - ```yaml - registry: gs://[YOUR BUCKET]/demo-repo/registry.db + ```yaml + registry: gs://[YOUR GCS BUCKET]/demo-repo/registry.db project: feast_java_demo provider: gcp online_store: type: redis + # Note: this would normally be using instance URL's to access Redis connection_string: localhost:6379,password=[YOUR PASSWORD] offline_store: type: file - flags: - alpha_features: true - on_demand_transforms: true + entity_key_serialization_version: 2 ``` 4. 
Run `feast apply` to apply your local features to the remote registry -5. Materialize features to the online store: + - Note: you may need to authenticate to gcloud first with `gcloud auth login` +6. Materialize features to the online store: ```bash CURRENT_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") feast materialize-incremental $CURRENT_TIME @@ -55,12 +55,12 @@ For this tutorial, we setup Feast with Redis, using the Feast CLI to register an ```bash minikube addons enable gcp-auth ``` -3. Add Feast's Java feature server chart repo +2. Add Feast's Java feature server chart repo ```bash helm repo add feast-charts https://feast-helm-charts.storage.googleapis.com helm repo update ``` -4. Modify the application-override.yaml file to have your credentials + bucket location: +3. Modify the application-override.yaml file to have your credentials + bucket location: ```yaml feature-server: application-override.yaml: @@ -80,16 +80,21 @@ For this tutorial, we setup Feast with Redis, using the Feast CLI to register an cache_ttl_seconds: 60 project: feast_java_demo ``` -5. Install the Feast helm chart: `helm install feast-release feast-charts/feast --values application-override.yaml` -6. (Optional): check logs of the server to make sure it’s working +4. Install the Feast helm chart: `helm install feast-release feast-charts/feast --values application-override.yaml` + > **Dev instructions**: if you're changing the java logic or chart, you can do + >1. `eval $(minikube docker-env)` + >2. `make build-java-docker-dev` + >3. In the `application-override.yaml`, uncomment the two `image: tag: dev` blocks + >4. `helm install feast-release ../../../infra/charts/feast --values application-override.yaml` +5. (Optional): check logs of the server to make sure it’s working ```bash kubectl logs svc/feast-release-feature-server ``` -7. Port forward to expose the grpc endpoint: +6. Port forward to expose the grpc endpoint: ```bash kubectl port-forward svc/feast-release-feature-server 6566:6566 ``` -8. 
Make a gRPC call: +7. Make a gRPC call: - Python example ```bash python test.py diff --git a/examples/java-demo/feature_repo/application-override.yaml b/examples/java-demo/feature_repo/application-override.yaml index dbdeda4c04..caaa5411e2 100644 --- a/examples/java-demo/feature_repo/application-override.yaml +++ b/examples/java-demo/feature_repo/application-override.yaml @@ -10,8 +10,18 @@ feature-server: host: my-redis-master port: 6379 password: [YOUR PASSWORD] + entityKeySerializationVersion: 2 + # Uncomment below for dev +# image: +# tag: dev + +# Uncomment below for dev +#transformation-service: +# image: +# tag: dev + global: registry: - path: gs://[YOUR BUCKET]/demo-repo/registry.db + path: gs://[YOUR GCS BUCKET]/demo-repo/registry.db cache_ttl_seconds: 60 project: feast_java_demo diff --git a/examples/java-demo/feature_repo/driver_repo.py b/examples/java-demo/feature_repo/driver_repo.py index e17a5d9cf8..f7dd05afff 100644 --- a/examples/java-demo/feature_repo/driver_repo.py +++ b/examples/java-demo/feature_repo/driver_repo.py @@ -1,13 +1,13 @@ +from datetime import timedelta + import pandas as pd + from feast.data_source import RequestSource -from feast.field import Field from feast.on_demand_feature_view import on_demand_feature_view -from feast.request_feature_view import RequestFeatureView from feast.types import Float32, Float64, Int64, String -from google.protobuf.duration_pb2 import Duration from feast.field import Field -from feast import Entity, Feature, BatchFeatureView, FileSource +from feast import Entity, FileSource, FeatureView driver_hourly_stats = FileSource( path="data/driver_stats_with_string.parquet", @@ -15,10 +15,10 @@ created_timestamp_column="created", ) driver = Entity(name="driver_id", description="driver id",) -driver_hourly_stats_view = BatchFeatureView( +driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", - entities=["driver_id"], - ttl=Duration(seconds=86400000), + entities=[driver], + ttl=timedelta(days=365), 
schema=[ Field(name="conv_rate", dtype=Float32), Field(name="acc_rate", dtype=Float32), @@ -26,7 +26,7 @@ Field(name="string_feature", dtype=String), ], online=True, - batch_source=driver_hourly_stats, + source=driver_hourly_stats, tags={}, ) @@ -40,6 +40,7 @@ ], ) + # Define an on demand feature view which can generate new features based on # existing feature views and RequestSource features @on_demand_feature_view( @@ -58,14 +59,3 @@ def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] return df - -# Define request feature view -driver_age_request_fv = RequestFeatureView( - name="driver_age", - request_data_source=RequestSource( - name="driver_age", - schema=[ - Field(name="driver_age", dtype=Int64), - ], - ), -) diff --git a/examples/java-demo/feature_repo/feature_store.yaml b/examples/java-demo/feature_repo/feature_store.yaml index 91c65b512a..16d426fc5a 100644 --- a/examples/java-demo/feature_repo/feature_store.yaml +++ b/examples/java-demo/feature_repo/feature_store.yaml @@ -1,11 +1,10 @@ -registry: gs://[YOUR BUCKET]/demo-repo/registry.db +registry: gs://[YOUR GCS BUCKET]/demo-repo/registry.db project: feast_java_demo provider: gcp online_store: type: redis + # Note: this would normally be using instance URL's to access Redis connection_string: localhost:6379,password=[YOUR PASSWORD] offline_store: type: file -flags: - alpha_features: true - on_demand_transforms: true +entity_key_serialization_version: 2 \ No newline at end of file diff --git a/examples/java-demo/feature_repo/test_python_fetch.py b/examples/java-demo/feature_repo/test_python_fetch.py new file mode 100644 index 0000000000..5e2781e150 --- /dev/null +++ b/examples/java-demo/feature_repo/test_python_fetch.py @@ -0,0 +1,26 @@ +from feast import FeatureStore + + +def run_demo(): + store = FeatureStore(repo_path=".") + + print("\n--- Online features ---") + features = store.get_online_features( + features=[ + 
"driver_hourly_stats:conv_rate", + ], + entity_rows=[ + { + "driver_id": 1001, + }, + { + "driver_id": 1002, + } + ], + ).to_dict() + for key, value in sorted(features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/examples/python-helm-demo/README.md b/examples/python-helm-demo/README.md new file mode 100644 index 0000000000..44cd4799d5 --- /dev/null +++ b/examples/python-helm-demo/README.md @@ -0,0 +1,89 @@ + +# Running Feast Python / Go Feature Server with Redis on Kubernetes + +For this tutorial, we set up Feast with Redis. + +We use the Feast CLI to register and materialize features, and then retrieving via a Feast Python feature server deployed in Kubernetes + +## First, let's set up a Redis cluster +1. Start minikube (`minikube start`) +2. Use helm to install a default Redis cluster + ```bash + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo update + helm install my-redis bitnami/redis + ``` + ![](redis-screenshot.png) +3. Port forward Redis so we can materialize features to it + + ```bash + kubectl port-forward --namespace default svc/my-redis-master 6379:6379 + ``` +4. Get your Redis password using the command (pasted below for convenience). We'll need this to tell Feast how to communicate with the cluster. + + ```bash + export REDIS_PASSWORD=$(kubectl get secret --namespace default my-redis -o jsonpath="{.data.redis-password}" | base64 --decode) + echo $REDIS_PASSWORD + ``` + +## Next, we setup a local Feast repo +1. Install Feast with Redis dependencies `pip install "feast[redis]"` +2. Make a bucket in GCS (or S3) +3. The feature repo is already setup here, so you just need to swap in your GCS bucket and Redis credentials. 
+ We need to modify the `feature_store.yaml`, which has two fields for you to replace: + ```yaml + registry: gs://[YOUR GCS BUCKET]/demo-repo/registry.db + project: feast_python_demo + provider: gcp + online_store: + type: redis + # Note: this would normally be using instance URL's to access Redis + connection_string: localhost:6379,password=[YOUR PASSWORD] + offline_store: + type: file + entity_key_serialization_version: 2 + ``` +4. Run `feast apply` from within the `feature_repo` directory to apply your local features to the remote registry + - Note: you may need to authenticate to gcloud first with `gcloud auth login` +5. Materialize features to the online store: + ```bash + CURRENT_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") + feast materialize-incremental $CURRENT_TIME + ``` + +## Now let's setup the Feast Server +1. Add the gcp-auth addon to mount GCP credentials: + ```bash + minikube addons enable gcp-auth + ``` +2. Add Feast's Python/Go feature server chart repo + ```bash + helm repo add feast-charts https://feast-helm-charts.storage.googleapis.com + helm repo update + ``` +3. For this tutorial, because we don't have a direct hosted endpoint into Redis, we need to change `feature_store.yaml` to talk to the Kubernetes Redis service + ```bash + sed -i '' 's/localhost:6379/my-redis-master:6379/g' feature_store.yaml + ``` +4. Install the Feast helm chart: `helm install feast-release feast-charts/feast-feature-server --set feature_store_yaml_base64=$(base64 feature_store.yaml)` + > **Dev instructions**: if you're changing the java logic or chart, you can do + 1. `eval $(minikube docker-env)` + 2. `make build-feature-server-dev` + 3. `helm install feast-release ../../../infra/charts/feast-feature-server --set image.tag=dev --set feature_store_yaml_base64=$(base64 feature_store.yaml)` +5. (Optional): check logs of the server to make sure it’s working + ```bash + kubectl logs svc/feast-feature-server + ``` +6. 
Port forward to expose the grpc endpoint: + ```bash + kubectl port-forward svc/feast-feature-server 6566:80 + ``` +7. Run test fetches for online features: + - First: change back the Redis connection string to allow localhost connections to Redis + ```bash + sed -i '' 's/my-redis-master:6379/localhost:6379/g' feature_store.yaml + ``` + - Then run the included fetch script, which fetches both via the HTTP endpoint and for comparison, via the Python SDK + ```bash + python test_python_fetch.py + ``` \ No newline at end of file diff --git a/sdk/python/feast/infra/registry_stores/__init__.py b/examples/python-helm-demo/feature_repo/__init__.py similarity index 100% rename from sdk/python/feast/infra/registry_stores/__init__.py rename to examples/python-helm-demo/feature_repo/__init__.py diff --git a/examples/python-helm-demo/feature_repo/data/driver_stats_with_string.parquet b/examples/python-helm-demo/feature_repo/data/driver_stats_with_string.parquet new file mode 100644 index 0000000000..83b8c31aa5 Binary files /dev/null and b/examples/python-helm-demo/feature_repo/data/driver_stats_with_string.parquet differ diff --git a/examples/python-helm-demo/feature_repo/driver_repo.py b/examples/python-helm-demo/feature_repo/driver_repo.py new file mode 100644 index 0000000000..f7dd05afff --- /dev/null +++ b/examples/python-helm-demo/feature_repo/driver_repo.py @@ -0,0 +1,61 @@ +from datetime import timedelta + +import pandas as pd + +from feast.data_source import RequestSource +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64, String +from feast.field import Field + +from feast import Entity, FileSource, FeatureView + +driver_hourly_stats = FileSource( + path="data/driver_stats_with_string.parquet", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) +driver = Entity(name="driver_id", description="driver id",) +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + 
entities=[driver], + ttl=timedelta(days=365), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + Field(name="string_feature", dtype=String), + ], + online=True, + source=driver_hourly_stats, + tags={}, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[ + driver_hourly_stats_view, + input_request, + ], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + diff --git a/examples/python-helm-demo/feature_repo/feature_store.yaml b/examples/python-helm-demo/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..d49c0cbd0e --- /dev/null +++ b/examples/python-helm-demo/feature_repo/feature_store.yaml @@ -0,0 +1,10 @@ +registry: gs://[YOUR GCS BUCKET]/demo-repo/registry.db +project: feast_python_demo +provider: gcp +online_store: + type: redis + # Note: this would normally be using instance URL's to access Redis + connection_string: localhost:6379,password=[YOUR PASSWORD] +offline_store: + type: file +entity_key_serialization_version: 2 \ No newline at end of file diff --git a/examples/python-helm-demo/feature_repo/test_python_fetch.py b/examples/python-helm-demo/feature_repo/test_python_fetch.py new file mode 100644 index 
0000000000..f9c7c62f4f --- /dev/null +++ b/examples/python-helm-demo/feature_repo/test_python_fetch.py @@ -0,0 +1,43 @@ +from feast import FeatureStore +import requests +import json + + +def run_demo_http(): + print("\n--- Online features with HTTP endpoint ---") + online_request = { + "features": [ + "driver_hourly_stats:conv_rate", + ], + "entities": {"driver_id": [1001, 1002]}, + } + r = requests.post( + "http://localhost:6566/get-online-features", data=json.dumps(online_request) + ) + print(json.dumps(r.json(), indent=4, sort_keys=True)) + + +def run_demo_sdk(): + store = FeatureStore(repo_path=".") + + print("\n--- Online features with SDK ---") + features = store.get_online_features( + features=[ + "driver_hourly_stats:conv_rate", + ], + entity_rows=[ + { + "driver_id": 1001, + }, + { + "driver_id": 1002, + }, + ], + ).to_dict() + for key, value in sorted(features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo_sdk() + run_demo_http() diff --git a/examples/python-helm-demo/redis-screenshot.png b/examples/python-helm-demo/redis-screenshot.png new file mode 100644 index 0000000000..489deb699d Binary files /dev/null and b/examples/python-helm-demo/redis-screenshot.png differ diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb index c7d0fcfe54..68b9d63911 100644 --- a/examples/quickstart/quickstart.ipynb +++ b/examples/quickstart/quickstart.ipynb @@ -8,21 +8,30 @@ "source": [ "# Overview\n", "\n", - "In this tutorial, we use feature stores to generate training data and power online model inference for a ride-sharing driver satisfaction prediction model. Feast addresses several common issues in this flow:\n", - "1. **Training-serving skew and complex data joins:** Feature values often exist across multiple tables. 
Joining these datasets can be complicated, slow, and error-prone.\n", - " - Feast joins these tables with battle-tested logic that ensures *point-in-time* correctness so future feature values do not leak to models.\n", - " - Feast alerts users to offline / online skew with data quality monitoring. \n", - "2. **Online feature availability:** At inference time, models often need access to features that aren't readily available and need to be precomputed from other datasources. \n", - " - Feast manages deployment to a variety of online stores (e.g. DynamoDB, Redis, Google Cloud Datastore) and ensures necessary features are consistently *available* and *freshly computed* at inference time.\n", - "3. **Feature reusability and model versioning:** Different teams within an organization are often unable to reuse features across projects, resulting in duplicate feature creation logic. Models have data dependencies that need to be versioned, for example when running A/B tests on model versions.\n", - " - Feast enables discovery of and collaboration on previously used features and enables versioning of sets of features (via *feature services*). \n", - " - Feast enables feature transformation so users can re-use transformation logic across online / offline usecases and across models.\n", + "In this tutorial, we'll use Feast to generate training data and power online model inference for a \n", + "ride-sharing driver satisfaction prediction model. Feast solves several common issues in this flow:\n", + "\n", + "1. **Training-serving skew and complex data joins:** Feature values often exist across multiple tables. Joining \n", + " these datasets can be complicated, slow, and error-prone.\n", + " * Feast joins these tables with battle-tested logic that ensures _point-in-time_ correctness so future feature \n", + " values do not leak to models.\n", + "2. 
**Online feature availability:** At inference time, models often need access to features that aren't readily \n", + " available and need to be precomputed from other data sources.\n", + " * Feast manages deployment to a variety of online stores (e.g. DynamoDB, Redis, Google Cloud Datastore) and \n", + " ensures necessary features are consistently _available_ and _freshly computed_ at inference time.\n", + "3. **Feature and model versioning:** Different teams within an organization are often unable to reuse \n", + " features across projects, resulting in duplicate feature creation logic. Models have data dependencies that need \n", + " to be versioned, for example when running A/B tests on model versions.\n", + " * Feast enables discovery of and collaboration on previously used features and enables versioning of sets of \n", + " features (via _feature services_).\n", + " * _(Experimental)_ Feast enables light-weight feature transformations so users can re-use transformation logic \n", + " across online / offline use cases and across models.\n", "\n", "We will:\n", - "- Deploy a local feature store with a Parquet file offline store and Sqlite online store.\n", - "- Build a training dataset using our time series features from our Parquet files.\n", - "- Materialize feature values from the offline store into the online store in preparation for low latency serving.\n", - "- Read the latest features from the online store for inference." + "1. Deploy a local feature store with a **Parquet file offline store** and **Sqlite online store**.\n", + "2. Build a training dataset using our time series features from our **Parquet files**.\n", + "3. Materialize feature values from the offline store into the online store.\n", + "4. Read the latest features from the online store for inference." 
] }, { @@ -40,11 +49,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rXNMAAJKQPG5", - "outputId": "94fb2260-4453-45c9-ba77-5b384823a621" + "id": "rXNMAAJKQPG5" }, "outputs": [], "source": [ @@ -83,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -96,11 +101,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "Feast is an open source project that collects anonymized error reporting and usage statistics. To opt out or learn more see https://docs.feast.dev/reference/usage\n", - "/usr/local/lib/python3.7/dist-packages/scipy/fft/__init__.py:97: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", - " from numpy.dual import register_func\n", - "/usr/local/lib/python3.7/dist-packages/scipy/sparse/sputils.py:17: DeprecationWarning: `np.typeDict` is a deprecated alias for `np.sctypeDict`.\n", - " supported_dtypes = [np.typeDict[x] for x in supported_dtypes]\n", "\n", "Creating a new Feast repository in \u001b[1m\u001b[32m/content/feature_repo\u001b[0m.\n", "\n" @@ -122,15 +122,17 @@ "Let's take a look at the demo repo itself. 
It breaks down into\n", "\n", "\n", - "* `data/` contains raw demo parquet data\n", - "* `example.py` contains demo feature definitions\n", - "* `feature_store.yaml` contains a demo setup configuring where data sources are\n", + "* `data/` contains raw demo parquet data\n", + "* `example_repo.py` contains demo feature definitions\n", + "* `feature_store.yaml` contains a demo setup configuring where data sources are\n", + "* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features.\n", + " * You can run this with `python test_workflow.py`.\n", "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -144,8 +146,8 @@ "output_type": "stream", "text": [ "/content/feature_repo\n", - ".:\n", - "data example.py feature_store.yaml __init__.py\n", + "README.md feature_store.yaml\n", + "__init__.py example_repo.py test_workflow.py\n", "\n", "./data:\n", "driver_stats.parquet\n" @@ -164,21 +166,29 @@ }, "source": [ "### Step 2b: Inspecting the project configuration\n", - "Let's inspect the setup of the project in `feature_store.yaml`. The key line defining the overall architecture of the feature store is the **provider**. This defines where the raw data exists (for generating training data & feature values for serving), and where to materialize feature values to in the online store (for serving). \n", + "Let's inspect the setup of the project in `feature_store.yaml`. \n", "\n", - "Valid values for `provider` in `feature_store.yaml` are:\n", + "The key line defining the overall architecture of the feature store is the **provider**. \n", "\n", - "* local: use file source with SQLite/Redis\n", - "* gcp: use BigQuery/Snowflake with Google Cloud Datastore/Redis\n", - "* aws: use Redshift/Snowflake with DynamoDB/Redis\n", + "The provider value sets default offline and online stores. 
\n", + "* The offline store provides the compute layer to process historical data (for generating training data & feature \n", + " values for serving). \n", + "* The online store is a low latency store of the latest feature values (for powering real-time inference).\n", "\n", - "Note that there are many other sources Feast works with, including Azure, Hive, Trino, and PostgreSQL via community plugins. See https://docs.feast.dev/getting-started/third-party-integrations for all supported datasources.\n", - "A custom setup can also be made by following https://docs.feast.dev/v/master/how-to-guides/creating-a-custom-provider" + "Valid values for `provider` in `feature_store.yaml` are:\n", + "\n", + "* local: use file source with SQLite/Redis\n", + "* gcp: use BigQuery/Snowflake with Google Cloud Datastore/Redis\n", + "* aws: use Redshift/Snowflake with DynamoDB/Redis\n", + "\n", + "Note that there are many other offline / online stores Feast works with, including Azure, Hive, Trino, and PostgreSQL via community plugins. 
See https://docs.feast.dev/roadmap for all supported connectors.\n", + "\n", + "A custom setup can also be made by following [Customizing Feast](https://docs.feast.dev/v/master/how-to-guides/customizing-feast)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -191,11 +201,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mproject\u001b[39;49;00m: feature_repo\n", - "\u001b[94mregistry\u001b[39;49;00m: data/registry.db\n", - "\u001b[94mprovider\u001b[39;49;00m: local\n", - "\u001b[94monline_store\u001b[39;49;00m:\n", - " \u001b[94mpath\u001b[39;49;00m: data/online_store.db\n" + "\u001b[94mproject\u001b[39;49;00m:\u001b[37m \u001b[39;49;00mfeature_repo\u001b[37m\u001b[39;49;00m\n", + "\u001b[37m# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry)\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", + "\u001b[94mregistry\u001b[39;49;00m:\u001b[37m \u001b[39;49;00mdata/registry.db\u001b[37m\u001b[39;49;00m\n", + "\u001b[37m# The provider primarily specifies default offline / online stores & storing the registry in a given cloud\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", + "\u001b[94mprovider\u001b[39;49;00m:\u001b[37m \u001b[39;49;00mlocal\u001b[37m\u001b[39;49;00m\n", + "\u001b[94monline_store\u001b[39;49;00m:\u001b[37m\u001b[39;49;00m\n", + "\u001b[37m \u001b[39;49;00m\u001b[94mpath\u001b[39;49;00m:\u001b[37m \u001b[39;49;00mdata/online_store.db\u001b[37m\u001b[39;49;00m\n", + "\u001b[94mentity_key_serialization_version\u001b[39;49;00m:\u001b[37m \u001b[39;49;00m2\u001b[37m\u001b[39;49;00m\n" ] } ], @@ -216,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -229,10 +242,7 @@ { "data": { "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", + "
\n", "\n", - "\n", - " \n", - "
\n", - "
\n", - " " + "
" ], "text/plain": [ " event_timestamp driver_id conv_rate acc_rate \\\n", - "0 2022-05-30 19:00:00+00:00 1005 0.061944 0.517414 \n", - "1 2022-05-30 20:00:00+00:00 1005 0.265881 0.636260 \n", - "2 2022-05-30 21:00:00+00:00 1005 0.830253 0.461117 \n", - "3 2022-05-30 22:00:00+00:00 1005 0.828208 0.520325 \n", - "4 2022-05-30 23:00:00+00:00 1005 0.375715 0.084719 \n", + "0 2022-07-24 14:00:00+00:00 1005 0.423913 0.082831 \n", + "1 2022-07-24 15:00:00+00:00 1005 0.507126 0.427470 \n", + "2 2022-07-24 16:00:00+00:00 1005 0.139810 0.129743 \n", + "3 2022-07-24 17:00:00+00:00 1005 0.383574 0.071728 \n", + "4 2022-07-24 18:00:00+00:00 1005 0.959131 0.440051 \n", "... ... ... ... ... \n", - "1802 2022-06-14 17:00:00+00:00 1001 0.016256 0.293051 \n", - "1803 2022-06-14 18:00:00+00:00 1001 0.651631 0.855919 \n", - "1804 2021-04-12 07:00:00+00:00 1001 0.828805 0.375509 \n", - "1805 2022-06-07 07:00:00+00:00 1003 0.324065 0.970185 \n", - "1806 2022-06-07 07:00:00+00:00 1003 0.324065 0.970185 \n", + "1802 2022-08-08 12:00:00+00:00 1001 0.994883 0.020145 \n", + "1803 2022-08-08 13:00:00+00:00 1001 0.663844 0.864639 \n", + "1804 2021-04-12 07:00:00+00:00 1001 0.068696 0.624977 \n", + "1805 2022-08-01 02:00:00+00:00 1003 0.980869 0.244420 \n", + "1806 2022-08-01 02:00:00+00:00 1003 0.980869 0.244420 \n", "\n", " avg_daily_trips created \n", - "0 467 2022-06-14 19:00:52.584 \n", - "1 709 2022-06-14 19:00:52.584 \n", - "2 731 2022-06-14 19:00:52.584 \n", - "3 919 2022-06-14 19:00:52.584 \n", - "4 874 2022-06-14 19:00:52.584 \n", + "0 201 2022-08-08 14:14:11.200 \n", + "1 690 2022-08-08 14:14:11.200 \n", + "2 845 2022-08-08 14:14:11.200 \n", + "3 839 2022-08-08 14:14:11.200 \n", + "4 2 2022-08-08 14:14:11.200 \n", "... ... ... 
\n", - "1802 908 2022-06-14 19:00:52.584 \n", - "1803 685 2022-06-14 19:00:52.584 \n", - "1804 106 2022-06-14 19:00:52.584 \n", - "1805 824 2022-06-14 19:00:52.584 \n", - "1806 824 2022-06-14 19:00:52.584 \n", + "1802 650 2022-08-08 14:14:11.200 \n", + "1803 359 2022-08-08 14:14:11.200 \n", + "1804 624 2022-08-08 14:14:11.200 \n", + "1805 790 2022-08-08 14:14:11.200 \n", + "1806 790 2022-08-08 14:14:11.200 \n", "\n", "[1807 rows x 6 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -499,69 +433,88 @@ }, "source": [ "### Step 3a: Inspecting feature definitions\n", - "Let's inspect what `example.py` looks like (the only python file in the repo):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DPqXCoNpL0SX", - "outputId": "a31a40c4-e60a-4f62-ae1c-227ce0aedea4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[38;2;64;128;128;03m# This is an example feature definition file\u001b[39;00m\n", - "\n", - "\u001b[38;2;0;128;0;01mfrom\u001b[39;00m \u001b[38;2;0;0;255;01mdatetime\u001b[39;00m \u001b[38;2;0;128;0;01mimport\u001b[39;00m timedelta\n", - "\n", - "\u001b[38;2;0;128;0;01mfrom\u001b[39;00m \u001b[38;2;0;0;255;01mfeast\u001b[39;00m \u001b[38;2;0;128;0;01mimport\u001b[39;00m Entity, FeatureService, FeatureView, Field, FileSource, ValueType\n", - "\u001b[38;2;0;128;0;01mfrom\u001b[39;00m \u001b[38;2;0;0;255;01mfeast\u001b[39;00m\u001b[38;2;0;0;255;01m.\u001b[39;00m\u001b[38;2;0;0;255;01mtypes\u001b[39;00m \u001b[38;2;0;128;0;01mimport\u001b[39;00m Float32, Int64\n", - "\n", - "\u001b[38;2;64;128;128;03m# Read data from parquet files. Parquet is convenient for local development mode. For\u001b[39;00m\n", - "\u001b[38;2;64;128;128;03m# production, you can use your favorite DWH, such as BigQuery. 
See Feast documentation\u001b[39;00m\n", - "\u001b[38;2;64;128;128;03m# for more info.\u001b[39;00m\n", - "driver_hourly_stats \u001b[38;2;102;102;102m=\u001b[39m FileSource(\n", - " path\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33m/content/feature_repo/data/driver_stats.parquet\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m,\n", - " timestamp_field\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mevent_timestamp\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m,\n", - " created_timestamp_column\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mcreated\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m,\n", - ")\n", - "\n", - "\u001b[38;2;64;128;128;03m# Define an entity for the driver. You can think of entity as a primary key used to\u001b[39;00m\n", - "\u001b[38;2;64;128;128;03m# fetch features.\u001b[39;00m\n", - "driver \u001b[38;2;102;102;102m=\u001b[39m Entity(name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mdriver\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m, join_keys\u001b[38;2;102;102;102m=\u001b[39m[\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mdriver_id\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m], value_type\u001b[38;2;102;102;102m=\u001b[39mValueType\u001b[38;2;102;102;102m.\u001b[39mINT64,)\n", - "\n", - "\u001b[38;2;64;128;128;03m# Our parquet files contain sample data that includes a driver_id column, timestamps and\u001b[39;00m\n", - "\u001b[38;2;64;128;128;03m# three feature column. 
Here we define a Feature View that will allow us to serve this\u001b[39;00m\n", - "\u001b[38;2;64;128;128;03m# data to our model online.\u001b[39;00m\n", - "driver_hourly_stats_view \u001b[38;2;102;102;102m=\u001b[39m FeatureView(\n", - " name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mdriver_hourly_stats\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m,\n", - " entities\u001b[38;2;102;102;102m=\u001b[39m[\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mdriver\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m],\n", - " ttl\u001b[38;2;102;102;102m=\u001b[39mtimedelta(days\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;102;102;102m1\u001b[39m),\n", - " schema\u001b[38;2;102;102;102m=\u001b[39m[\n", - " Field(name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mconv_rate\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m, dtype\u001b[38;2;102;102;102m=\u001b[39mFloat32),\n", - " Field(name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33macc_rate\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m, dtype\u001b[38;2;102;102;102m=\u001b[39mFloat32),\n", - " Field(name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mavg_daily_trips\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m, dtype\u001b[38;2;102;102;102m=\u001b[39mInt64),\n", - " ],\n", - " online\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;0;128;0;01mTrue\u001b[39;00m,\n", - " source\u001b[38;2;102;102;102m=\u001b[39mdriver_hourly_stats,\n", - " tags\u001b[38;2;102;102;102m=\u001b[39m{},\n", - ")\n", - "\n", - "driver_stats_fs \u001b[38;2;102;102;102m=\u001b[39m FeatureService(\n", - " name\u001b[38;2;102;102;102m=\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m\u001b[38;2;186;33;33mdriver_activity\u001b[39m\u001b[38;2;186;33;33m\"\u001b[39m, features\u001b[38;2;102;102;102m=\u001b[39m[driver_hourly_stats_view]\n", - ")\n" - ] - } - ], - "source": [ - "!pygmentize -f 
terminal16m example.py" + "Let's inspect what `example_repo.py` looks like:\n", + "\n", + "```python\n", + "# This is an example feature definition file\n", + "\n", + "from datetime import timedelta\n", + "\n", + "import pandas as pd\n", + "\n", + "from feast import Entity, FeatureService, FeatureView, Field, FileSource, RequestSource, PushSource\n", + "from feast.on_demand_feature_view import on_demand_feature_view\n", + "from feast.types import Float32, Int64, Float64\n", + "\n", + "# Read data from parquet files. Parquet is convenient for local development mode. For\n", + "# production, you can use your favorite DWH, such as BigQuery. See Feast documentation\n", + "# for more info.\n", + "driver_hourly_stats = FileSource(\n", + " name=\"driver_hourly_stats_source\",\n", + " path=\"/content/feature_repo/data/driver_stats.parquet\",\n", + " timestamp_field=\"event_timestamp\",\n", + " created_timestamp_column=\"created\",\n", + ")\n", + "\n", + "# Define an entity for the driver. You can think of entity as a primary key used to\n", + "# fetch features.\n", + "driver = Entity(name=\"driver\", join_keys=[\"driver_id\"])\n", + "\n", + "# Our parquet files contain sample data that includes a driver_id column, timestamps and\n", + "# three feature column. 
Here we define a Feature View that will allow us to serve this\n", + "# data to our model online.\n", + "driver_hourly_stats_view = FeatureView(\n", + " name=\"driver_hourly_stats\",\n", + " entities=[driver],\n", + " ttl=timedelta(days=1),\n", + " schema=[\n", + " Field(name=\"conv_rate\", dtype=Float32),\n", + " Field(name=\"acc_rate\", dtype=Float32),\n", + " Field(name=\"avg_daily_trips\", dtype=Int64),\n", + " ],\n", + " online=True,\n", + " source=driver_hourly_stats,\n", + " tags={},\n", + ")\n", + "\n", + "# Defines a way to push data (to be available offline, online or both) into Feast.\n", + "driver_stats_push_source = PushSource(\n", + " name=\"driver_stats_push_source\",\n", + " batch_source=driver_hourly_stats,\n", + ")\n", + "\n", + "# Define a request data source which encodes features / information only\n", + "# available at request time (e.g. part of the user initiated HTTP request)\n", + "input_request = RequestSource(\n", + " name=\"vals_to_add\",\n", + " schema=[\n", + " Field(name=\"val_to_add\", dtype=Int64),\n", + " Field(name=\"val_to_add_2\", dtype=Int64),\n", + " ],\n", + ")\n", + "\n", + "\n", + "# Define an on demand feature view which can generate new features based on\n", + "# existing feature views and RequestSource features\n", + "@on_demand_feature_view(\n", + " sources=[driver_hourly_stats_view, input_request],\n", + " schema=[\n", + " Field(name=\"conv_rate_plus_val1\", dtype=Float64),\n", + " Field(name=\"conv_rate_plus_val2\", dtype=Float64),\n", + " ],\n", + ")\n", + "def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame:\n", + " df = pd.DataFrame()\n", + " df[\"conv_rate_plus_val1\"] = inputs[\"conv_rate\"] + inputs[\"val_to_add\"]\n", + " df[\"conv_rate_plus_val2\"] = inputs[\"conv_rate\"] + inputs[\"val_to_add_2\"]\n", + " return df\n", + "\n", + "\n", + "# This groups features into a model version\n", + "driver_stats_fs = FeatureService(\n", + " name=\"driver_activity\", features=[driver_hourly_stats_view, 
transformed_conv_rate]\n", + ")\n", + "```" ] }, { @@ -571,12 +524,12 @@ }, "source": [ "### Step 3b: Applying feature definitions\n", - "Now we run `feast apply` to register the feature views and entities defined in `example.py`, and sets up SQLite online store tables. Note that we had previously specified SQLite as the online store in `feature_store.yaml` by specifying a `local` provider." + "Now we run `feast apply` to register the feature views and entities defined in `example_repo.py`, and sets up SQLite online store tables. Note that we had previously specified SQLite as the online store in `feature_store.yaml` by specifying a `local` provider." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -589,12 +542,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/local/lib/python3.7/dist-packages/scipy/fft/__init__.py:97: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", - " from numpy.dual import register_func\n", - "/usr/local/lib/python3.7/dist-packages/scipy/sparse/sputils.py:17: DeprecationWarning: `np.typeDict` is a deprecated alias for `np.sctypeDict`.\n", - " supported_dtypes = [np.typeDict[x] for x in supported_dtypes]\n", + "RuntimeWarning: On demand feature view is an experimental feature. 
This API is stable, but the functionality does not scale well for offline retrieval\n", + " warnings.warn(\n", "Created entity \u001b[1m\u001b[32mdriver\u001b[0m\n", "Created feature view \u001b[1m\u001b[32mdriver_hourly_stats\u001b[0m\n", + "Created on demand feature view \u001b[1m\u001b[32mtransformed_conv_rate\u001b[0m\n", "Created feature service \u001b[1m\u001b[32mdriver_activity\u001b[0m\n", "\n", "Created sqlite table \u001b[1m\u001b[32mfeature_repo_driver_hourly_stats\u001b[0m\n", @@ -612,17 +564,24 @@ "id": "uV7rtRQgzyf0" }, "source": [ - "## Step 4: Generate training data\n", + "## Step 4: Generating training data or powering batch scoring models\n", + "\n", + "To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). Feast can help generate the features that map to these labels.\n", + "\n", + "Feast needs a list of **entities** (e.g. driver ids) and **timestamps**. Feast will intelligently join relevant \n", + "tables to create the relevant feature vectors. There are two ways to generate this list:\n", + "1. The user can query that table of labels with timestamps and pass that into Feast as an _entity dataframe_ for \n", + "training data generation. \n", + "2. The user can also query that table with a *SQL query* which pulls entities. See the documentation on [feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) for details \n", "\n", - "To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). 
\n", + "* Note that we include timestamps because we want the features for the same driver at various timestamps to be used in a model.\n", "\n", - "The user can query that table of labels with timestamps and pass that into Feast as an *entity dataframe* for training data generation. In many cases, Feast will also intelligently join relevant tables to create the relevant feature vectors.\n", - "- Note that we include timestamps because want the features for the same driver at various timestamps to be used in a model." + "### Step 4a: Generating training data" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -631,16 +590,6 @@ "outputId": "58c4c3dd-7a10-4f56-901d-1bb879ebbcb8" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/scipy/fft/__init__.py:97: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", - " from numpy.dual import register_func\n", - "/usr/local/lib/python3.7/dist-packages/scipy/sparse/sputils.py:17: DeprecationWarning: `np.typeDict` is a deprecated alias for `np.sctypeDict`.\n", - " supported_dtypes = [np.typeDict[x] for x in supported_dtypes]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -648,50 +597,67 @@ "----- Feature schema -----\n", "\n", "\n", - "Int64Index: 3 entries, 1080 to 359\n", - "Data columns (total 6 columns):\n", + "RangeIndex: 3 entries, 0 to 2\n", + "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 driver_id 3 non-null int64 \n", - " 1 label_driver_reported_satisfaction 3 non-null int64 \n", - " 2 event_timestamp 3 non-null datetime64[ns, UTC]\n", - " 3 conv_rate 3 non-null float32 \n", - " 4 acc_rate 3 non-null float32 \n", - " 5 avg_daily_trips 3 non-null int32 \n", - "dtypes: datetime64[ns, UTC](1), 
float32(2), int32(1), int64(2)\n", - "memory usage: 132.0 bytes\n", + " 1 event_timestamp 3 non-null datetime64[ns, UTC]\n", + " 2 label_driver_reported_satisfaction 3 non-null int64 \n", + " 3 val_to_add 3 non-null int64 \n", + " 4 val_to_add_2 3 non-null int64 \n", + " 5 conv_rate 3 non-null float32 \n", + " 6 acc_rate 3 non-null float32 \n", + " 7 avg_daily_trips 3 non-null int32 \n", + " 8 conv_rate_plus_val1 3 non-null float64 \n", + " 9 conv_rate_plus_val2 3 non-null float64 \n", + "dtypes: datetime64[ns, UTC](1), float32(2), float64(2), int32(1), int64(4)\n", + "memory usage: 332.0 bytes\n", "None\n", "\n", "----- Example features -----\n", "\n", - " driver_id label_driver_reported_satisfaction \\\n", - "1080 1003 3 \n", - "720 1002 5 \n", - "359 1001 1 \n", + " driver_id event_timestamp label_driver_reported_satisfaction \\\n", + "0 1001 2021-04-12 10:59:42+00:00 1 \n", + "1 1002 2021-04-12 08:12:10+00:00 5 \n", + "2 1003 2021-04-12 16:40:26+00:00 3 \n", + "\n", + " val_to_add val_to_add_2 conv_rate acc_rate avg_daily_trips \\\n", + "0 1 10 0.356766 0.051319 93 \n", + "1 2 20 0.130452 0.359439 522 \n", + "2 3 30 0.666570 0.343380 266 \n", "\n", - " event_timestamp conv_rate acc_rate avg_daily_trips \n", - "1080 2022-06-14 17:48:10.734341+00:00 0.525623 0.217880 488 \n", - "720 2022-06-14 18:25:10.734338+00:00 0.181652 0.659991 974 \n", - "359 2022-06-14 18:50:10.734322+00:00 0.651631 0.855919 685 \n" + " conv_rate_plus_val1 conv_rate_plus_val2 \n", + "0 1.356766 10.356766 \n", + "1 2.130452 20.130452 \n", + "2 3.666570 30.666570 \n" ] } ], "source": [ - "from datetime import datetime, timedelta\n", + "from datetime import datetime\n", "import pandas as pd\n", "\n", "from feast import FeatureStore\n", "\n", "# The entity dataframe is the dataframe we want to enrich with feature values\n", + "# Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve\n", + "# for all entities in the offline store 
instead\n", "entity_df = pd.DataFrame.from_dict(\n", " {\n", + " # entity's join key -> entity values\n", " \"driver_id\": [1001, 1002, 1003],\n", - " \"label_driver_reported_satisfaction\": [1, 5, 3], \n", + " # \"event_timestamp\" (reserved key) -> timestamps\n", " \"event_timestamp\": [\n", - " datetime.now() - timedelta(minutes=11),\n", - " datetime.now() - timedelta(minutes=36),\n", - " datetime.now() - timedelta(minutes=73),\n", + " datetime(2021, 4, 12, 10, 59, 42),\n", + " datetime(2021, 4, 12, 8, 12, 10),\n", + " datetime(2021, 4, 12, 16, 40, 26),\n", " ],\n", + " # (optional) label name -> label values. Feast does not process these\n", + " \"label_driver_reported_satisfaction\": [1, 5, 3],\n", + " # values we're using for an on-demand transformation\n", + " \"val_to_add\": [1, 2, 3],\n", + " \"val_to_add_2\": [10, 20, 30],\n", " }\n", ")\n", "\n", @@ -703,6 +669,8 @@ " \"driver_hourly_stats:conv_rate\",\n", " \"driver_hourly_stats:acc_rate\",\n", " \"driver_hourly_stats:avg_daily_trips\",\n", + " \"transformed_conv_rate:conv_rate_plus_val1\",\n", + " \"transformed_conv_rate:conv_rate_plus_val2\",\n", " ],\n", ").to_df()\n", "\n", @@ -714,6 +682,65 @@ "print(training_df.head())" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "GFiXVdhz04t0" + }, + "source": [ + "### Step 4b: Run offline inference (batch scoring)\n", + "To power a batch model, we primarily need to generate features with the `get_historical_features` call, but using the current timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rGR_xgIs04t0", + "outputId": "3496e5a1-79ff-4f3c-e35d-22b594992708" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "----- Example features -----\n", + "\n", + " driver_id event_timestamp \\\n", + "0 1001 2022-08-08 18:22:06.555018+00:00 \n", + "1 1002 2022-08-08 18:22:06.555018+00:00 \n", + "2 1003 2022-08-08 18:22:06.555018+00:00 \n", + "\n", + " 
label_driver_reported_satisfaction val_to_add val_to_add_2 conv_rate \\\n", + "0 1 1 10 0.663844 \n", + "1 5 2 20 0.151189 \n", + "2 3 3 30 0.769165 \n", + "\n", + " acc_rate avg_daily_trips conv_rate_plus_val1 conv_rate_plus_val2 \n", + "0 0.864639 359 1.663844 10.663844 \n", + "1 0.695982 311 2.151189 20.151189 \n", + "2 0.949191 789 3.769165 30.769165 \n" + ] + } + ], + "source": [ + "entity_df[\"event_timestamp\"] = pd.to_datetime(\"now\", utc=True)\n", + "training_df = store.get_historical_features(\n", + " entity_df=entity_df,\n", + " features=[\n", + " \"driver_hourly_stats:conv_rate\",\n", + " \"driver_hourly_stats:acc_rate\",\n", + " \"driver_hourly_stats:avg_daily_trips\",\n", + " \"transformed_conv_rate:conv_rate_plus_val1\",\n", + " \"transformed_conv_rate:conv_rate_plus_val2\",\n", + " ],\n", + ").to_df()\n", + "\n", + "print(\"\\n----- Example features -----\\n\")\n", + "print(training_df.head())" + ] + }, { "cell_type": "markdown", "metadata": { @@ -729,21 +756,21 @@ "id": "KCXUpiQ_pmDk" }, "source": [ - "### Step 5a: Using `feast materialize-incremental`\n", + "### Step 5a: Using `materialize_incremental`\n", "\n", - "We now serialize the latest values of features since the beginning of time to prepare for serving (note: `materialize-incremental` serializes all new features since the last `materialize` call).\n", + "We now serialize the latest values of features since the beginning of time to prepare for serving (note: `materialize_incremental` serializes all new features since the last `materialize` call).\n", "\n", "An alternative to using the CLI command is to use Python:\n", "\n", - "```python\n", - "from datetime import datetime\n", - "store.materialize_incremental(datetime.datetime.now())\n", + "```bash\n", + "CURRENT_TIME=$(date -u +\"%Y-%m-%dT%H:%M:%S\")\n", + "feast materialize-incremental $CURRENT_TIME\n", "```" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" @@ -756,20 +783,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/local/lib/python3.7/dist-packages/scipy/fft/__init__.py:97: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", - " from numpy.dual import register_func\n", - "/usr/local/lib/python3.7/dist-packages/scipy/sparse/sputils.py:17: DeprecationWarning: `np.typeDict` is a deprecated alias for `np.sctypeDict`.\n", - " supported_dtypes = [np.typeDict[x] for x in supported_dtypes]\n", - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2022-06-14 19:01:13+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2022-08-08 14:19:04-04:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32mdriver_hourly_stats\u001b[0m from \u001b[1m\u001b[32m2022-06-13 19:01:14+00:00\u001b[0m to \u001b[1m\u001b[32m2022-06-14 19:01:13+00:00\u001b[0m:\n", - "100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 403.44it/s]\n" + "\u001b[1m\u001b[32mdriver_hourly_stats\u001b[0m from \u001b[1m\u001b[32m2022-08-07 18:19:04-04:00\u001b[0m to \u001b[1m\u001b[32m2022-08-08 14:19:04-04:00\u001b[0m:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 346.47it/s]\n" ] } ], "source": [ "from datetime import datetime\n", - "!feast materialize-incremental {datetime.now().isoformat()}" + "store.materialize_incremental(datetime.now())" ] }, { @@ -785,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -799,7 +828,7 @@ "output_type": "stream", "text": [ "--- Data directory ---\n", - 
"driver_stats.parquet online_store.db registry.db\n", + "driver_stats.parquet online_store.db registry.db\n", "\n", "--- Schema of online store ---\n", "['entity_key', 'feature_name', 'value', 'event_ts', 'created_ts']\n" @@ -838,7 +867,7 @@ "id": "GNecKOaI0J2Z" }, "source": [ - "## Step 6: Fetching feature vectors for inference" + "## Step 6: Fetching real-time feature vectors for online inference" ] }, { @@ -852,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -865,10 +894,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'acc_rate': [0.11573542654514313, 0.19155333936214447],\n", - " 'avg_daily_trips': [814, 64],\n", - " 'conv_rate': [0.5311259031295776, 0.9771925806999207],\n", - " 'driver_id': [1004, 1005]}\n" + "{'acc_rate': [0.86463862657547, 0.6959823369979858],\n", + " 'avg_daily_trips': [359, 311],\n", + " 'conv_rate_plus_val1': [1000.6638441681862, 1001.1511893719435],\n", + " 'conv_rate_plus_val2': [2000.6638441681862, 2002.1511893719435],\n", + " 'driver_id': [1001, 1002]}\n" ] } ], @@ -880,13 +910,23 @@ "\n", "feature_vector = store.get_online_features(\n", " features=[\n", - " \"driver_hourly_stats:conv_rate\",\n", " \"driver_hourly_stats:acc_rate\",\n", " \"driver_hourly_stats:avg_daily_trips\",\n", + " \"transformed_conv_rate:conv_rate_plus_val1\",\n", + " \"transformed_conv_rate:conv_rate_plus_val2\",\n", " ],\n", " entity_rows=[\n", - " {\"driver_id\": 1004},\n", - " {\"driver_id\": 1005},\n", + " # {join_key: entity_value}\n", + " {\n", + " \"driver_id\": 1001,\n", + " \"val_to_add\": 1000,\n", + " \"val_to_add_2\": 2000,\n", + " },\n", + " {\n", + " \"driver_id\": 1002,\n", + " \"val_to_add\": 1001,\n", + " \"val_to_add_2\": 2002,\n", + " },\n", " ],\n", ").to_dict()\n", "\n", @@ -913,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" @@ -926,10 +966,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'acc_rate': [0.11573542654514313, 0.19155333936214447],\n", - " 'avg_daily_trips': [814, 64],\n", - " 'conv_rate': [0.5311259031295776, 0.9771925806999207],\n", - " 'driver_id': [1004, 1005]}\n" + "{'acc_rate': [0.86463862657547, 0.6959823369979858],\n", + " 'avg_daily_trips': [359, 311],\n", + " 'conv_rate': [0.6638441681861877, 0.15118937194347382],\n", + " 'conv_rate_plus_val1': [1000.6638441681862, 1001.1511893719435],\n", + " 'conv_rate_plus_val2': [2000.6638441681862, 2002.1511893719435],\n", + " 'driver_id': [1001, 1002]}\n" ] } ], @@ -942,13 +984,77 @@ " features=feature_service,\n", " entity_rows=[\n", " # {join_key: entity_value}\n", - " {\"driver_id\": 1004},\n", - " {\"driver_id\": 1005},\n", + " {\n", + " \"driver_id\": 1001,\n", + " \"val_to_add\": 1000,\n", + " \"val_to_add_2\": 2000,\n", + " },\n", + " {\n", + " \"driver_id\": 1002,\n", + " \"val_to_add\": 1001,\n", + " \"val_to_add_2\": 2002,\n", + " },\n", " ],\n", ").to_dict()\n", "pprint(feature_vector)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "PvPOSPV904t7" + }, + "source": [ + "## Step 7: Making streaming features available in Feast\n", + "Feast does not directly ingest from streaming sources. Instead, Feast relies on a push-based model to push features into Feast. You can write a streaming pipeline that generates features, which can then be pushed to the offline store, the online store, or both (depending on your needs).\n", + "\n", + "This relies on the `PushSource` defined above. Pushing to this source will populate all dependent feature views with the pushed feature values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uAg5xKDF04t7", + "outputId": "8288b911-125f-4141-b286-f6f84bcb24ea" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- Simulate a stream event ingestion of the hourly stats df ---\n", + " driver_id event_timestamp created conv_rate acc_rate \\\n", + "0 1001 2021-05-13 10:59:42 2021-05-13 10:59:42 1.0 1.0 \n", + "\n", + " avg_daily_trips \n", + "0 1000 \n" + ] + } + ], + "source": [ + "from feast.data_source import PushMode\n", + "\n", + "print(\"\\n--- Simulate a stream event ingestion of the hourly stats df ---\")\n", + "event_df = pd.DataFrame.from_dict(\n", + " {\n", + " \"driver_id\": [1001],\n", + " \"event_timestamp\": [\n", + " datetime(2021, 5, 13, 10, 59, 42),\n", + " ],\n", + " \"created\": [\n", + " datetime(2021, 5, 13, 10, 59, 42),\n", + " ],\n", + " \"conv_rate\": [1.0],\n", + " \"acc_rate\": [1.0],\n", + " \"avg_daily_trips\": [1000],\n", + " }\n", + ")\n", + "print(event_df)\n", + "store.push(\"driver_stats_push_source\", event_df, to=PushMode.ONLINE_AND_OFFLINE)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -967,17 +1073,32 @@ "metadata": { "colab": { "collapsed_sections": [], - "name": "Feast Codelab", + "name": "quickstart.ipynb", "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.8.10 64-bit ('python-3.8')", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "7d634b9af180bcb32a446a43848522733ff8f5bbf0cc46dba1a83bede04bf237" + } } }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git 
a/go/internal/feast/onlinestore/sqliteonlinestore_test.go b/go/internal/feast/onlinestore/sqliteonlinestore_test.go index e5e6e85e56..9a56f4df1a 100644 --- a/go/internal/feast/onlinestore/sqliteonlinestore_test.go +++ b/go/internal/feast/onlinestore/sqliteonlinestore_test.go @@ -16,17 +16,18 @@ import ( func TestSqliteAndFeatureRepoSetup(t *testing.T) { dir := t.TempDir() - feature_repo_path := filepath.Join(dir, "feature_repo") + feature_repo_path := filepath.Join(dir, "my_project", "feature_repo") err := test.SetupCleanFeatureRepo(dir) assert.Nil(t, err) config, err := registry.NewRepoConfigFromFile(feature_repo_path) assert.Nil(t, err) - assert.Equal(t, "feature_repo", config.Project) + assert.Equal(t, "my_project", config.Project) assert.Equal(t, "data/registry.db", config.GetRegistryConfig().Path) assert.Equal(t, "local", config.Provider) assert.Equal(t, map[string]interface{}{ "path": "data/online_store.db", + "type": "sqlite", }, config.OnlineStore) assert.Empty(t, config.OfflineStore) assert.Empty(t, config.FeatureServer) @@ -35,12 +36,12 @@ func TestSqliteAndFeatureRepoSetup(t *testing.T) { func TestSqliteOnlineRead(t *testing.T) { dir := t.TempDir() - feature_repo_path := filepath.Join(dir, "feature_repo") + feature_repo_path := filepath.Join(dir, "my_project", "feature_repo") test.SetupCleanFeatureRepo(dir) config, err := registry.NewRepoConfigFromFile(feature_repo_path) assert.Nil(t, err) - store, err := NewSqliteOnlineStore("feature_repo", config, config.OnlineStore) + store, err := NewSqliteOnlineStore("my_project", config, config.OnlineStore) defer store.Destruct() assert.Nil(t, err) entity_key1 := types.EntityKey{ diff --git a/go/internal/feast/registry/local.go b/go/internal/feast/registry/local.go index 8b35e5756b..124fcba3ed 100644 --- a/go/internal/feast/registry/local.go +++ b/go/internal/feast/registry/local.go @@ -12,15 +12,15 @@ import ( "github.com/feast-dev/feast/go/protos/feast/core" ) -// A LocalRegistryStore is a file-based 
implementation of the RegistryStore interface. -type LocalRegistryStore struct { +// A FileRegistryStore is a file-based implementation of the RegistryStore interface. +type FileRegistryStore struct { filePath string } -// NewLocalRegistryStore creates a LocalRegistryStore with the given configuration and infers +// NewFileRegistryStore creates a FileRegistryStore with the given configuration and infers // the file path from the repo path and registry path. -func NewLocalRegistryStore(config *RegistryConfig, repoPath string) *LocalRegistryStore { - lr := LocalRegistryStore{} +func NewFileRegistryStore(config *RegistryConfig, repoPath string) *FileRegistryStore { + lr := FileRegistryStore{} registryPath := config.Path if filepath.IsAbs(registryPath) { lr.filePath = registryPath @@ -31,7 +31,7 @@ func NewLocalRegistryStore(config *RegistryConfig, repoPath string) *LocalRegist } // GetRegistryProto reads and parses the registry proto from the file path. -func (r *LocalRegistryStore) GetRegistryProto() (*core.Registry, error) { +func (r *FileRegistryStore) GetRegistryProto() (*core.Registry, error) { registry := &core.Registry{} in, err := ioutil.ReadFile(r.filePath) if err != nil { @@ -43,15 +43,15 @@ func (r *LocalRegistryStore) GetRegistryProto() (*core.Registry, error) { return registry, nil } -func (r *LocalRegistryStore) UpdateRegistryProto(rp *core.Registry) error { +func (r *FileRegistryStore) UpdateRegistryProto(rp *core.Registry) error { return r.writeRegistry(rp) } -func (r *LocalRegistryStore) Teardown() error { +func (r *FileRegistryStore) Teardown() error { return os.Remove(r.filePath) } -func (r *LocalRegistryStore) writeRegistry(rp *core.Registry) error { +func (r *FileRegistryStore) writeRegistry(rp *core.Registry) error { rp.VersionId = uuid.New().String() rp.LastUpdated = timestamppb.Now() bytes, err := proto.Marshal(rp) diff --git a/go/internal/feast/registry/registry.go b/go/internal/feast/registry/registry.go index c67a50a5a6..9d0684d023 100644 
--- a/go/internal/feast/registry/registry.go +++ b/go/internal/feast/registry/registry.go @@ -16,8 +16,8 @@ var REGISTRY_SCHEMA_VERSION string = "1" var REGISTRY_STORE_CLASS_FOR_SCHEME map[string]string = map[string]string{ "gs": "GCSRegistryStore", "s3": "S3RegistryStore", - "file": "LocalRegistryStore", - "": "LocalRegistryStore", + "file": "FileRegistryStore", + "": "FileRegistryStore", } /* @@ -335,8 +335,8 @@ func getRegistryStoreFromScheme(registryPath string, registryConfig *RegistryCon func getRegistryStoreFromType(registryStoreType string, registryConfig *RegistryConfig, repoPath string) (RegistryStore, error) { switch registryStoreType { - case "LocalRegistryStore": - return NewLocalRegistryStore(registryConfig, repoPath), nil + case "FileRegistryStore": + return NewFileRegistryStore(registryConfig, repoPath), nil } - return nil, errors.New("only LocalRegistryStore as a RegistryStore is supported at this moment") + return nil, errors.New("only FileRegistryStore as a RegistryStore is supported at this moment") } diff --git a/go/internal/test/feature_repo/example.py b/go/internal/test/feature_repo/example.py index 2b1d74ad32..7084361007 100644 --- a/go/internal/test/feature_repo/example.py +++ b/go/internal/test/feature_repo/example.py @@ -1,10 +1,11 @@ # This is an example feature definition file -from google.protobuf.duration_pb2 import Duration +from datetime import timedelta -from feast import Entity, Feature, FeatureView, FileSource, ValueType, FeatureService +from feast import Entity, Feature, FeatureView, Field, FileSource, FeatureService from feast.feature_logging import LoggingConfig from feast.infra.offline_stores.file_source import FileLoggingDestination +from feast.types import Float32, Int64 # Read data from parquet files. Parquet is convenient for local development mode. For # production, you can use your favorite DWH, such as BigQuery. 
See Feast documentation @@ -15,24 +16,24 @@ created_timestamp_column="created", ) -# Define an entity for the driver. You can think of entity as a primary key used to +# Define an entity for the driver. You can think of an entity as a primary key used to # fetch features. -driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",) +driver = Entity(name="driver_id", description="driver id") # Our parquet files contain sample data that includes a driver_id column, timestamps and # three feature column. Here we define a Feature View that will allow us to serve this # data to our model online. driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", - entities=["driver_id"], - ttl=Duration(seconds=86400 * 365 * 10), - features=[ - Feature(name="conv_rate", dtype=ValueType.FLOAT), - Feature(name="acc_rate", dtype=ValueType.FLOAT), - Feature(name="avg_daily_trips", dtype=ValueType.INT64), + entities=[driver], + ttl=timedelta(seconds=86400 * 365 * 10), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), ], online=True, - batch_source=driver_hourly_stats, + source=driver_hourly_stats, tags={}, ) diff --git a/go/internal/test/go_integration_test_utils.go b/go/internal/test/go_integration_test_utils.go index 275edc7b98..3ec9aa2a4c 100644 --- a/go/internal/test/go_integration_test_utils.go +++ b/go/internal/test/go_integration_test_utils.go @@ -88,7 +88,7 @@ func GetLatestFeatures(Rows []*Row, entities map[int64]bool) map[int64]*Row { } func SetupCleanFeatureRepo(basePath string) error { - cmd := exec.Command("feast", "init", "feature_repo") + cmd := exec.Command("feast", "init", "my_project") path, err := filepath.Abs(basePath) cmd.Env = os.Environ() @@ -102,7 +102,7 @@ func SetupCleanFeatureRepo(basePath string) error { } applyCommand := exec.Command("feast", "apply") applyCommand.Env = os.Environ() - featureRepoPath, err := 
filepath.Abs(filepath.Join(path, "feature_repo")) + featureRepoPath, err := filepath.Abs(filepath.Join(path, "my_project", "feature_repo")) if err != nil { return err } diff --git a/infra/charts/feast-feature-server/Chart.yaml b/infra/charts/feast-feature-server/Chart.yaml index 6c1afc9540..81970bc1a8 100644 --- a/infra/charts/feast-feature-server/Chart.yaml +++ b/infra/charts/feast-feature-server/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: feast-feature-server description: Feast Feature Server in Go or Python type: application -version: 0.22.0 +version: 0.24.0 keywords: - machine learning - big data diff --git a/infra/charts/feast-feature-server/README.md b/infra/charts/feast-feature-server/README.md index a55451e788..1ee114d9c8 100644 --- a/infra/charts/feast-feature-server/README.md +++ b/infra/charts/feast-feature-server/README.md @@ -1,24 +1,33 @@ -# feast-feature-server +# Feast Python / Go Feature Server Helm Charts -![Version: 0.22.0](https://img.shields.io/badge/Version-0.22.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) +Current chart version is `0.24.0` -Feast Feature Server in Go or Python +## Installation -**Homepage:** +Run the following commands to add the repository -## Source Code +``` +helm repo add feast-charts https://feast-helm-charts.storage.googleapis.com +helm repo update +``` + +Install Feast -* +A base64 encoded version of the `feature_store.yaml` file is needed. 
Helm install example: +``` +helm install feast-feature-server feast-charts/feast-feature-server --set feature_store_yaml_base64=$(base64 feature_store.yaml) +``` ## Values | Key | Type | Default | Description | |-----|------|---------|-------------| | affinity | object | `{}` | | +| feature_store_yaml_base64 | string | `""` | [required] a base64 encoded version of feature_store.yaml | | fullnameOverride | string | `""` | | | image.pullPolicy | string | `"IfNotPresent"` | | -| image.repository | string | `""` | | -| image.tag | string | `""` | | +| image.repository | string | `"feastdev/feature-server"` | Docker image for Feature Server repository | +| image.tag | string | `"0.23.0"` | The Docker image tag (can be overwritten if custom feature server deps are needed for on demand transforms) | | imagePullSecrets | list | `[]` | | | livenessProbe.initialDelaySeconds | int | `30` | | | livenessProbe.periodSeconds | int | `30` | | @@ -33,50 +42,4 @@ Feast Feature Server in Go or Python | securityContext | object | `{}` | | | service.port | int | `80` | | | service.type | string | `"ClusterIP"` | | -| tolerations | list | `[]` | | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) - - -Docker repository and tag are required. Helm install example: -``` -helm install feast-feature-server . --set image.repository=REPO --set image.tag=TAG -``` - -Deployment assumes that `feature_store.yaml` exists on docker image. Example docker image: -``` -FROM python:3.8 - -RUN apt update && \ - apt install -y jq - -RUN pip install pip --upgrade - -RUN pip install feast - -COPY feature_store.yaml /feature_store.yaml -``` - -Furthermore, if you wish to use the Go feature server, then you must install the Apache Arrow C++ libraries, and your `feature_store.yaml` should include `go_feature_server: True`. 
-For more details, see the [docs](https://docs.feast.dev/reference/feature-servers/go-feature-server). -The docker image might look like: -``` -FROM python:3.8 - -RUN apt update && \ - apt install -y jq - -RUN pip install pip --upgrade - -RUN pip install feast - -RUN apt update -RUN apt install -y -V ca-certificates lsb-release wget -RUN wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb -RUN apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb -RUN apt update -RUN apt -y install libarrow-dev - -COPY feature_store.yaml /feature_store.yaml -``` \ No newline at end of file +| tolerations | list | `[]` | | \ No newline at end of file diff --git a/infra/charts/feast-feature-server/README.md.gotmpl b/infra/charts/feast-feature-server/README.md.gotmpl new file mode 100644 index 0000000000..75f2827466 --- /dev/null +++ b/infra/charts/feast-feature-server/README.md.gotmpl @@ -0,0 +1,23 @@ +# Feast Python / Go Feature Server Helm Charts + +Current chart version is `{{ template "chart.version" . }}` + +## Installation + +Run the following commands to add the repository + +``` +helm repo add feast-charts https://feast-helm-charts.storage.googleapis.com +helm repo update +``` + +Install Feast + +A base64 encoded version of the `feature_store.yaml` file is needed. Helm install example: +``` +helm install feast-feature-server feast-charts/feast-feature-server --set feature_store_yaml_base64=$(base64 feature_store.yaml) +``` + +{{ template "chart.requirementsSection" . }} + +{{ template "chart.valuesSection" . 
}} \ No newline at end of file diff --git a/infra/charts/feast-feature-server/templates/deployment.yaml b/infra/charts/feast-feature-server/templates/deployment.yaml index 69cf92f6c0..94c56de9dd 100644 --- a/infra/charts/feast-feature-server/templates/deployment.yaml +++ b/infra/charts/feast-feature-server/templates/deployment.yaml @@ -30,6 +30,9 @@ spec: {{- toYaml .Values.securityContext | nindent 12 }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: FEATURE_STORE_YAML_BASE64 + value: {{ .Values.feature_store_yaml_base64 }} command: ["feast", "serve", "-h", "0.0.0.0"] ports: - name: http diff --git a/infra/charts/feast-feature-server/values.yaml b/infra/charts/feast-feature-server/values.yaml index f62f95a757..257cf03bfa 100644 --- a/infra/charts/feast-feature-server/values.yaml +++ b/infra/charts/feast-feature-server/values.yaml @@ -5,14 +5,19 @@ replicaCount: 1 image: - repository: "" + # image.repository -- Docker image for Feature Server repository + repository: feastdev/feature-server pullPolicy: IfNotPresent - tag: "" + # image.tag -- The Docker image tag (can be overwritten if custom feature server deps are needed for on demand transforms) + tag: 0.24.0 imagePullSecrets: [] nameOverride: "" fullnameOverride: "" +# feature_store_yaml_base64 -- [required] a base64 encoded version of feature_store.yaml +feature_store_yaml_base64: "" + podAnnotations: {} podSecurityContext: {} diff --git a/infra/charts/feast-python-server/Chart.yaml b/infra/charts/feast-python-server/Chart.yaml index 6ab82b7a65..d2b45ee8b6 100644 --- a/infra/charts/feast-python-server/Chart.yaml +++ b/infra/charts/feast-python-server/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: feast-python-server description: Feast Feature Server in Python type: application -version: 0.23.0 +version: 0.24.0 keywords: - machine learning - big data diff --git a/infra/charts/feast-python-server/README.md 
b/infra/charts/feast-python-server/README.md index e3da9b1d29..acdf527531 100644 --- a/infra/charts/feast-python-server/README.md +++ b/infra/charts/feast-python-server/README.md @@ -1,14 +1,28 @@ -# feast-python-server +# Feast Python Feature Server Helm Charts (deprecated) -![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) +> Note: this helm chart is deprecated in favor of [feast-feature-server](../feast-feature-server/README.md) -Feast Feature Server in Python +Current chart version is `0.24.0` -**Homepage:** +## Installation +Docker repository and tag are required. Helm install example: +``` +helm install feast-python-server . --set image.repository=REPO --set image.tag=TAG +``` + +Deployment assumes that `feature_store.yaml` exists on docker image. Example docker image: +``` +FROM python:3.8 + +RUN apt update && \ + apt install -y jq -## Source Code +RUN pip install pip --upgrade -* +RUN pip install feast + +COPY feature_store.yaml /feature_store.yaml +``` ## Values @@ -33,27 +47,4 @@ Feast Feature Server in Python | securityContext | object | `{}` | | | service.port | int | `80` | | | service.type | string | `"ClusterIP"` | | -| tolerations | list | `[]` | | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) - - -Docker repository and tag are required. Helm install example: -``` -helm install feast-python-server . --set image.repository=REPO --set image.tag=TAG -``` - -Deployment assumes that `feature_store.yaml` exists on docker image. 
Example docker image: -``` -FROM python:3.8 - -RUN apt update && \ - apt install -y jq - -RUN pip install pip --upgrade - -RUN pip install feast - -COPY feature_store.yaml /feature_store.yaml -``` \ No newline at end of file +| tolerations | list | `[]` | | \ No newline at end of file diff --git a/infra/charts/feast-python-server/README.md.gotmpl b/infra/charts/feast-python-server/README.md.gotmpl new file mode 100644 index 0000000000..cb264c0066 --- /dev/null +++ b/infra/charts/feast-python-server/README.md.gotmpl @@ -0,0 +1,29 @@ +# Feast Python Feature Server Helm Charts (deprecated) + +> Note: this helm chart is deprecated in favor of [feast-feature-server](../feast-feature-server/README.md) + +Current chart version is `{{ template "chart.version" . }}` + +## Installation +Docker repository and tag are required. Helm install example: +``` +helm install feast-python-server . --set image.repository=REPO --set image.tag=TAG +``` + +Deployment assumes that `feature_store.yaml` exists on docker image. Example docker image: +``` +FROM python:3.8 + +RUN apt update && \ + apt install -y jq + +RUN pip install pip --upgrade + +RUN pip install feast + +COPY feature_store.yaml /feature_store.yaml +``` + +{{ template "chart.requirementsSection" . }} + +{{ template "chart.valuesSection" . 
}} \ No newline at end of file diff --git a/infra/charts/feast-python-server/values.yaml b/infra/charts/feast-python-server/values.yaml index f62f95a757..6d0ab9c0ae 100644 --- a/infra/charts/feast-python-server/values.yaml +++ b/infra/charts/feast-python-server/values.yaml @@ -5,8 +5,10 @@ replicaCount: 1 image: + # image.repository -- [required] The repository for the Docker image repository: "" pullPolicy: IfNotPresent + # image.tag -- [required] The Docker image tag tag: "" imagePullSecrets: [] diff --git a/infra/charts/feast/Chart.yaml b/infra/charts/feast/Chart.yaml index f4e33de7f3..a657298b52 100644 --- a/infra/charts/feast/Chart.yaml +++ b/infra/charts/feast/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Feature store for machine learning name: feast -version: 0.23.0 +version: 0.24.0 keywords: - machine learning - big data diff --git a/infra/charts/feast/README.md b/infra/charts/feast/README.md index f71dcf6124..7a0f5f77aa 100644 --- a/infra/charts/feast/README.md +++ b/infra/charts/feast/README.md @@ -8,7 +8,7 @@ This repo contains Helm charts for Feast components that are being installed on ## Chart: Feast -Feature store for machine learning Current chart version is `0.23.0` +Feature store for machine learning Current chart version is `0.24.0` ## Installation @@ -54,9 +54,9 @@ For more details, please see: https://docs.feast.dev/how-to-guides/running-feast | Repository | Name | Version | |------------|------|---------| -| https://charts.helm.sh/stable | redis | 10.5.6 | -| https://feast-helm-charts.storage.googleapis.com | feature-server(feature-server) | 0.23.0 | -| https://feast-helm-charts.storage.googleapis.com | transformation-service(transformation-service) | 0.23.0 | +| https://charts.helm.sh/stable | redis | 10.5.6 | +| https://feast-helm-charts.storage.googleapis.com | feature-server(feature-server) | 0.24.0 | +| https://feast-helm-charts.storage.googleapis.com | transformation-service(transformation-service) | 0.24.0 | ## Values diff 
--git a/infra/charts/feast/README.md.gotmpl b/infra/charts/feast/README.md.gotmpl index acb4e830e7..e215858fe0 100644 --- a/infra/charts/feast/README.md.gotmpl +++ b/infra/charts/feast/README.md.gotmpl @@ -1,7 +1,5 @@ # Feast Helm Charts -> :warning: **Disclaimer**: Since Feast 0.10 our vision is to manage all infrastructure for feature store from one place - Feast SDK. But while this new paradigm is still in development, we are planning to support the installation of some Feast components (like Java feature server) through Helm chart presented in this repository. However, we do not expect helm chart to become a long-term solution for deploying Feast components to production, and some frictions still might exist. For example, you will need to manually sync some configurations from [feature_store.yaml](https://docs.feast.dev/reference/feature-repository/feature-store-yaml) into the chart context (like path to the registry file or project name). - This repo contains Helm charts for Feast components that are being installed on Kubernetes: * Feast (root chart): The complete Helm chart containing all Feast components and dependencies. Most users will use this chart, but can selectively enable/disable subcharts using the values.yaml file. * [Feature Server](charts/feature-server): High performant JVM-based implementation of feature server. 
diff --git a/infra/charts/feast/charts/feature-server/Chart.yaml b/infra/charts/feast/charts/feature-server/Chart.yaml index ee08b0b0f8..f238b6aee4 100644 --- a/infra/charts/feast/charts/feature-server/Chart.yaml +++ b/infra/charts/feast/charts/feature-server/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 description: "Feast Feature Server: Online feature serving service for Feast" name: feature-server -version: 0.23.0 -appVersion: v0.23.0 +version: 0.24.0 +appVersion: v0.24.0 keywords: - machine learning - big data diff --git a/infra/charts/feast/charts/feature-server/README.md b/infra/charts/feast/charts/feature-server/README.md index 4717cfff3a..465665fb3b 100644 --- a/infra/charts/feast/charts/feature-server/README.md +++ b/infra/charts/feast/charts/feature-server/README.md @@ -1,6 +1,6 @@ # feature-server -![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![AppVersion: v0.23.0](https://img.shields.io/badge/AppVersion-v0.23.0-informational?style=flat-square) +![Version: 0.24.0](https://img.shields.io/badge/Version-0.24.0-informational?style=flat-square) ![AppVersion: v0.24.0](https://img.shields.io/badge/AppVersion-v0.24.0-informational?style=flat-square) Feast Feature Server: Online feature serving service for Feast @@ -8,63 +8,60 @@ Feast Feature Server: Online feature serving service for Feast ## Values -| Key | Type | Default | Description | -|-----|------|-------------------------------------------------------|-------------| -| "application-generated.yaml".enabled | bool | `true` | Flag to include Helm generated configuration. Please set `application-override.yaml` to override this configuration. | -| "application-override.yaml" | object | `{"enabled":true}` | Configuration to override the default [application.yaml](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Will be created as a ConfigMap. 
`application-override.yaml` has a higher precedence than `application-secret.yaml` | -| "application-secret.yaml" | object | `{"enabled":true}` | Configuration to override the default [application.yaml](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Will be created as a Secret. `application-override.yaml` has a higher precedence than `application-secret.yaml`. It is recommended to either set `application-override.yaml` or `application-secret.yaml` only to simplify config management. | -| "application.yaml".enabled | bool | `true` | Flag to include the default [configuration](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Please set `application-override.yaml` to override this configuration. | -| envOverrides | object | `{}` | Extra environment variables to set | -| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | -| image.repository | string | `"feastdev/feature-server-java"` | Docker image for Feature Server repository | -| image.tag | string | `"0.23.0"` | Image tag | -| ingress.grpc.annotations | object | `{}` | Extra annotations for the ingress | -| ingress.grpc.auth.enabled | bool | `false` | Flag to enable auth | -| ingress.grpc.class | string | `"nginx"` | Which ingress controller to use | -| ingress.grpc.enabled | bool | `false` | Flag to create an ingress resource for the service | -| ingress.grpc.hosts | list | `[]` | List of hostnames to match when routing requests | -| ingress.grpc.https.enabled | bool | `true` | Flag to enable HTTPS | -| ingress.grpc.https.secretNames | object | `{}` | Map of hostname to TLS secret name | -| ingress.grpc.whitelist | string | `""` | Allowed client IP source ranges | -| ingress.http.annotations | object | `{}` | Extra annotations for the ingress | +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| "application-generated.yaml".enabled | bool | `true` | Flag to include Helm 
generated configuration. Please set `application-override.yaml` to override this configuration. | +| "application-override.yaml" | object | `{"enabled":true}` | Configuration to override the default [application.yaml](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Will be created as a ConfigMap. `application-override.yaml` has a higher precedence than `application-secret.yaml` | +| "application-secret.yaml" | object | `{"enabled":false}` | Configuration to override the default [application.yaml](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Will be created as a Secret. `application-override.yaml` has a higher precedence than `application-secret.yaml`. It is recommended to either set `application-override.yaml` or `application-secret.yaml` only to simplify config management. | +| "application.yaml".enabled | bool | `true` | Flag to include the default [configuration](https://github.com/feast-dev/feast/blob/master/java/serving/src/main/resources/application.yml). Please set `application-override.yaml` to override this configuration. 
| +| envOverrides | object | `{}` | Extra environment variables to set | +| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | +| image.repository | string | `"feastdev/feature-server-java"` | Docker image for Feature Server repository | +| image.tag | string | `"0.24.0"` | Image tag | +| ingress.grpc.annotations | object | `{}` | Extra annotations for the ingress | +| ingress.grpc.auth.enabled | bool | `false` | Flag to enable auth | +| ingress.grpc.class | string | `"nginx"` | Which ingress controller to use | +| ingress.grpc.enabled | bool | `false` | Flag to create an ingress resource for the service | +| ingress.grpc.hosts | list | `[]` | List of hostnames to match when routing requests | +| ingress.grpc.https.enabled | bool | `true` | Flag to enable HTTPS | +| ingress.grpc.https.secretNames | object | `{}` | Map of hostname to TLS secret name | +| ingress.grpc.whitelist | string | `""` | Allowed client IP source ranges | +| ingress.http.annotations | object | `{}` | Extra annotations for the ingress | | ingress.http.auth.authUrl | string | `"http://auth-server.auth-ns.svc.cluster.local/auth"` | URL to an existing authentication service | -| ingress.http.auth.enabled | bool | `false` | Flag to enable auth | -| ingress.http.class | string | `"nginx"` | Which ingress controller to use | -| ingress.http.enabled | bool | `false` | Flag to create an ingress resource for the service | -| ingress.http.hosts | list | `[]` | List of hostnames to match when routing requests | -| ingress.http.https.enabled | bool | `true` | Flag to enable HTTPS | -| ingress.http.https.secretNames | object | `{}` | Map of hostname to TLS secret name | -| ingress.http.whitelist | string | `""` | Allowed client IP source ranges | -| javaOpts | string | `nil` | [JVM options](https://docs.oracle.com/cd/E22289_01/html/821-1274/configuring-the-default-jvm-and-java-arguments.html). For better performance, it is advised to set the min and max heap:
`-Xms2048m -Xmx2048m` | -| livenessProbe.enabled | bool | `true` | Flag to enabled the probe | -| livenessProbe.failureThreshold | int | `5` | Min consecutive failures for the probe to be considered failed | -| livenessProbe.initialDelaySeconds | int | `60` | Delay before the probe is initiated | -| livenessProbe.periodSeconds | int | `10` | How often to perform the probe | -| livenessProbe.successThreshold | int | `1` | Min consecutive success for the probe to be considered successful | -| livenessProbe.timeoutSeconds | int | `5` | When the probe times out | -| logLevel | string | `"WARN"` | Default log level, use either one of `DEBUG`, `INFO`, `WARN` or `ERROR` | -| logType | string | `"Console"` | Log format, either `JSON` or `Console` | -| nodeSelector | object | `{}` | Node labels for pod assignment | -| podAnnotations | object | `{}` | Annotations to be added to Feast Serving pods | -| podLabels | object | `{}` | Labels to be added to Feast Serving pods | -| readinessProbe.enabled | bool | `true` | Flag to enabled the probe | -| readinessProbe.failureThreshold | int | `5` | Min consecutive failures for the probe to be considered failed | -| readinessProbe.initialDelaySeconds | int | `15` | Delay before the probe is initiated | -| readinessProbe.periodSeconds | int | `10` | How often to perform the probe | -| readinessProbe.successThreshold | int | `1` | Min consecutive success for the probe to be considered successful | -| readinessProbe.timeoutSeconds | int | `10` | When the probe times out | -| replicaCount | int | `1` | Number of pods that will be created | -| resources | object | `{}` | CPU/memory [resource requests/limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) | -| secrets | list | `[]` | List of Kubernetes secrets to be mounted. These secrets will be mounted on /etc/secrets/. 
| -| service.grpc.nodePort | string | `nil` | Port number that each cluster node will listen to | -| service.grpc.port | int | `6566` | Service port for GRPC requests | -| service.grpc.targetPort | int | `6566` | Container port serving GRPC requests | -| service.http.nodePort | string | `nil` | Port number that each cluster node will listen to | -| service.http.port | int | `80` | Service port for HTTP requests | -| service.http.targetPort | int | `8080` | Container port serving HTTP requests and Prometheus metrics | -| service.type | string | `"ClusterIP"` | Kubernetes service type | -| transformationService.host | string | `""` | | -| transformationService.port | int | `6566` | | +| ingress.http.auth.enabled | bool | `false` | Flag to enable auth | +| ingress.http.class | string | `"nginx"` | Which ingress controller to use | +| ingress.http.enabled | bool | `false` | Flag to create an ingress resource for the service | +| ingress.http.hosts | list | `[]` | List of hostnames to match when routing requests | +| ingress.http.https.enabled | bool | `true` | Flag to enable HTTPS | +| ingress.http.https.secretNames | object | `{}` | Map of hostname to TLS secret name | +| ingress.http.whitelist | string | `""` | Allowed client IP source ranges | +| javaOpts | string | `nil` | [JVM options](https://docs.oracle.com/cd/E22289_01/html/821-1274/configuring-the-default-jvm-and-java-arguments.html). For better performance, it is advised to set the min and max heap:
`-Xms2048m -Xmx2048m` | +| livenessProbe.enabled | bool | `true` | Flag to enabled the probe | +| livenessProbe.failureThreshold | int | `5` | Min consecutive failures for the probe to be considered failed | +| livenessProbe.initialDelaySeconds | int | `60` | Delay before the probe is initiated | +| livenessProbe.periodSeconds | int | `10` | How often to perform the probe | +| livenessProbe.successThreshold | int | `1` | Min consecutive success for the probe to be considered successful | +| livenessProbe.timeoutSeconds | int | `5` | When the probe times out | +| logLevel | string | `"WARN"` | Default log level, use either one of `DEBUG`, `INFO`, `WARN` or `ERROR` | +| logType | string | `"Console"` | Log format, either `JSON` or `Console` | +| nodeSelector | object | `{}` | Node labels for pod assignment | +| podAnnotations | object | `{}` | Annotations to be added to Feast Serving pods | +| podLabels | object | `{}` | Labels to be added to Feast Serving pods | +| readinessProbe.enabled | bool | `true` | Flag to enabled the probe | +| readinessProbe.failureThreshold | int | `5` | Min consecutive failures for the probe to be considered failed | +| readinessProbe.initialDelaySeconds | int | `15` | Delay before the probe is initiated | +| readinessProbe.periodSeconds | int | `10` | How often to perform the probe | +| readinessProbe.successThreshold | int | `1` | Min consecutive success for the probe to be considered successful | +| readinessProbe.timeoutSeconds | int | `10` | When the probe times out | +| replicaCount | int | `1` | Number of pods that will be created | +| resources | object | `{}` | CPU/memory [resource requests/limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) | +| secrets | list | `[]` | List of Kubernetes secrets to be mounted. These secrets will be mounted on /etc/secrets/. 
| +| service.grpc.nodePort | string | `nil` | Port number that each cluster node will listen to | +| service.grpc.port | int | `6566` | Service port for GRPC requests | +| service.grpc.targetPort | int | `6566` | Container port serving GRPC requests | +| service.type | string | `"ClusterIP"` | Kubernetes service type | +| transformationService.host | string | `""` | | +| transformationService.port | int | `6566` | | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/infra/charts/feast/charts/feature-server/templates/configmap.yaml b/infra/charts/feast/charts/feature-server/templates/configmap.yaml index fbf2633e8e..c172e9e288 100644 --- a/infra/charts/feast/charts/feature-server/templates/configmap.yaml +++ b/infra/charts/feast/charts/feature-server/templates/configmap.yaml @@ -28,9 +28,6 @@ data: config: host: {{ .Release.Name }}-redis-master port: 6379 - rest: - server: - port: {{ .Values.service.http.targetPort }} grpc: server: port: {{ .Values.service.grpc.targetPort }} diff --git a/infra/charts/feast/charts/feature-server/templates/deployment.yaml b/infra/charts/feast/charts/feature-server/templates/deployment.yaml index 1d1bc40029..ad0a12b3fc 100644 --- a/infra/charts/feast/charts/feature-server/templates/deployment.yaml +++ b/infra/charts/feast/charts/feature-server/templates/deployment.yaml @@ -106,8 +106,6 @@ spec: {{- end }} ports: - - name: http - containerPort: {{ .Values.service.http.targetPort }} - name: grpc containerPort: {{ .Values.service.grpc.targetPort }} diff --git a/infra/charts/feast/charts/feature-server/templates/service.yaml b/infra/charts/feast/charts/feature-server/templates/service.yaml index 037fe03870..c2455bd9f7 100644 --- a/infra/charts/feast/charts/feature-server/templates/service.yaml +++ 
b/infra/charts/feast/charts/feature-server/templates/service.yaml @@ -22,12 +22,6 @@ spec: {{ toYaml .Values.service.loadBalancerSourceRanges | indent 2 }} {{- end }} ports: - - name: http - port: {{ .Values.service.http.port }} - targetPort: {{ .Values.service.http.targetPort }} - {{- if .Values.service.http.nodePort }} - nodePort: {{ .Values.service.http.nodePort }} - {{- end }} - name: grpc port: {{ .Values.service.grpc.port }} targetPort: {{ .Values.service.grpc.targetPort }} diff --git a/infra/charts/feast/charts/feature-server/values.yaml b/infra/charts/feast/charts/feature-server/values.yaml index 011ce9dc33..b014d8cee7 100644 --- a/infra/charts/feast/charts/feature-server/values.yaml +++ b/infra/charts/feast/charts/feature-server/values.yaml @@ -5,7 +5,7 @@ image: # image.repository -- Docker image for Feature Server repository repository: feastdev/feature-server-java # image.tag -- Image tag - tag: 0.23.0 + tag: 0.24.0 # image.pullPolicy -- Image pull policy pullPolicy: IfNotPresent @@ -71,13 +71,6 @@ readinessProbe: service: # service.type -- Kubernetes service type type: ClusterIP - http: - # service.http.port -- Service port for HTTP requests - port: 80 - # service.http.targetPort -- Container port serving HTTP requests and Prometheus metrics - targetPort: 8080 - # service.http.nodePort -- Port number that each cluster node will listen to - nodePort: grpc: # service.grpc.port -- Service port for GRPC requests port: 6566 diff --git a/infra/charts/feast/charts/transformation-service/Chart.yaml b/infra/charts/feast/charts/transformation-service/Chart.yaml index 07055730c5..4c650544f5 100644 --- a/infra/charts/feast/charts/transformation-service/Chart.yaml +++ b/infra/charts/feast/charts/transformation-service/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 description: "Transformation service: to compute on-demand features" name: transformation-service -version: 0.23.0 -appVersion: v0.23.0 +version: 0.24.0 +appVersion: v0.24.0 keywords: - machine learning - big 
data diff --git a/infra/charts/feast/charts/transformation-service/README.md b/infra/charts/feast/charts/transformation-service/README.md index 9bc7a1e5d6..7b55e1a10c 100644 --- a/infra/charts/feast/charts/transformation-service/README.md +++ b/infra/charts/feast/charts/transformation-service/README.md @@ -1,6 +1,6 @@ # transformation-service -![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![AppVersion: v0.23.0](https://img.shields.io/badge/AppVersion-v0.23.0-informational?style=flat-square) +![Version: 0.24.0](https://img.shields.io/badge/Version-0.24.0-informational?style=flat-square) ![AppVersion: v0.24.0](https://img.shields.io/badge/AppVersion-v0.24.0-informational?style=flat-square) Transformation service: to compute on-demand features @@ -8,20 +8,21 @@ Transformation service: to compute on-demand features ## Values -| Key | Type | Default | Description | -|-----|------|--------------------------------------------|-------------| -| envOverrides | object | `{}` | Extra environment variables to set | -| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| envOverrides | object | `{}` | Extra environment variables to set | +| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | | image.repository | string | `"feastdev/feature-transformation-server"` | Docker image for Transformation Server repository | -| image.tag | string | `"0.23.0"` | Image tag | -| nodeSelector | object | `{}` | Node labels for pod assignment | -| podLabels | object | `{}` | Labels to be added to Feast Serving pods | -| replicaCount | int | `1` | Number of pods that will be created | -| resources | object | `{}` | CPU/memory [resource requests/limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) | -| service.grpc.nodePort | string | `nil` | 
Port number that each cluster node will listen to | -| service.grpc.port | int | `6566` | Service port for GRPC requests | -| service.grpc.targetPort | int | `6566` | Container port serving GRPC requests | -| service.type | string | `"ClusterIP"` | Kubernetes service type | +| image.tag | string | `"0.24.0"` | Image tag | +| nodeSelector | object | `{}` | Node labels for pod assignment | +| podLabels | object | `{}` | Labels to be added to Feast Serving pods | +| replicaCount | int | `1` | Number of pods that will be created | +| resources | object | `{}` | CPU/memory [resource requests/limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) | +| secrets | list | `[]` | List of Kubernetes secrets to be mounted. These secrets will be mounted on /etc/secrets/. | +| service.grpc.nodePort | string | `nil` | Port number that each cluster node will listen to | +| service.grpc.port | int | `6566` | Service port for GRPC requests | +| service.grpc.targetPort | int | `6566` | Container port serving GRPC requests | +| service.type | string | `"ClusterIP"` | Kubernetes service type | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/infra/charts/feast/charts/transformation-service/config/feature_store.yaml b/infra/charts/feast/charts/transformation-service/config/feature_store.yaml index 555e93a306..c003b87cc2 100644 --- a/infra/charts/feast/charts/transformation-service/config/feature_store.yaml +++ b/infra/charts/feast/charts/transformation-service/config/feature_store.yaml @@ -2,7 +2,4 @@ registry: path: {{ .Values.global.registry.path }} cache_ttl_seconds: {{ .Values.global.registry.cache_ttl_seconds }} provider: local -project: {{ 
.Values.global.project }} -flags: - on_demand_transforms: true - alpha_features: true \ No newline at end of file +project: {{ .Values.global.project }} \ No newline at end of file diff --git a/infra/charts/feast/charts/transformation-service/values.yaml b/infra/charts/feast/charts/transformation-service/values.yaml index c1e506a476..149d613e9f 100644 --- a/infra/charts/feast/charts/transformation-service/values.yaml +++ b/infra/charts/feast/charts/transformation-service/values.yaml @@ -5,7 +5,7 @@ image: # image.repository -- Docker image for Transformation Server repository repository: feastdev/feature-transformation-server # image.tag -- Image tag - tag: 0.23.0 + tag: 0.24.0 # image.pullPolicy -- Image pull policy pullPolicy: IfNotPresent diff --git a/infra/charts/feast/requirements.yaml b/infra/charts/feast/requirements.yaml index c88fb7a4fa..5dd4a4bce1 100644 --- a/infra/charts/feast/requirements.yaml +++ b/infra/charts/feast/requirements.yaml @@ -1,12 +1,12 @@ dependencies: - name: feature-server alias: feature-server - version: 0.23.0 + version: 0.24.0 condition: feature-server.enabled repository: https://feast-helm-charts.storage.googleapis.com - name: transformation-service alias: transformation-service - version: 0.23.0 + version: 0.24.0 condition: transformation-service.enabled repository: https://feast-helm-charts.storage.googleapis.com - name: redis diff --git a/infra/scripts/cleanup_dynamo_ci.py b/infra/scripts/cleanup_dynamo_ci.py new file mode 100644 index 0000000000..2dda36cc5a --- /dev/null +++ b/infra/scripts/cleanup_dynamo_ci.py @@ -0,0 +1,22 @@ +import boto3 +from tqdm import tqdm + + +def main() -> None: + db = boto3.resource("dynamodb") + + num_to_delete = 0 + all_tables = db.tables.all() + for table in all_tables: + if "integration_test" in table.name: + num_to_delete += 1 + with tqdm(total=num_to_delete) as progress: + for table in all_tables: + if "integration_test" in table.name: + table.delete() + progress.update() + print(f"Deleted 
{num_to_delete} CI DynamoDB tables") + + +if __name__ == "__main__": + main() diff --git a/infra/scripts/helm/push-helm-charts.sh b/infra/scripts/helm/push-helm-charts.sh index 08753adb3c..1c32ee985b 100755 --- a/infra/scripts/helm/push-helm-charts.sh +++ b/infra/scripts/helm/push-helm-charts.sh @@ -17,7 +17,9 @@ helm repo add feast-helm-chart-repo $bucket cd infra/charts helm package feast helm package feast-python-server +helm package feast-feature-server helm gcs push --public feast-${1}.tgz feast-helm-chart-repo --force helm gcs push --public feast-python-server-${1}.tgz feast-helm-chart-repo --force +helm gcs push --public feast-feature-server-${1}.tgz feast-helm-chart-repo --force rm -f ./*.tgz \ No newline at end of file diff --git a/infra/scripts/helm/validate-helm-chart-versions.sh b/infra/scripts/helm/validate-helm-chart-versions.sh index 0ba75bd744..aac79d9315 100755 --- a/infra/scripts/helm/validate-helm-chart-versions.sh +++ b/infra/scripts/helm/validate-helm-chart-versions.sh @@ -3,7 +3,7 @@ set -e # Amount of file locations that need to be bumped in unison when versions increment -UNIQUE_VERSIONS_COUNT=18 +UNIQUE_VERSIONS_COUNT=20 if [ $# -ne 1 ]; then echo "Please provide a single semver version (without a \"v\" prefix) to test the repository against, e.g 0.99.0" diff --git a/infra/scripts/publish-java-sdk.sh b/infra/scripts/publish-java-sdk.sh index 68174db17a..0e8b62478f 100755 --- a/infra/scripts/publish-java-sdk.sh +++ b/infra/scripts/publish-java-sdk.sh @@ -69,4 +69,4 @@ gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/private-key echo "============================================================" echo "Deploying Java SDK with revision: $REVISION" echo "============================================================" -mvn -f java/pom.xml --projects .,datatypes,sdk -Drevision=$REVISION --batch-mode clean deploy +mvn -f java/pom.xml --projects .,datatypes,serving-client -Drevision=$REVISION --batch-mode clean deploy diff --git 
a/infra/scripts/create-cluster.sh b/infra/scripts/redis-cluster.sh similarity index 100% rename from infra/scripts/create-cluster.sh rename to infra/scripts/redis-cluster.sh diff --git a/infra/scripts/release/files_to_bump.txt b/infra/scripts/release/files_to_bump.txt index a1e2d29623..e94ec88db0 100644 --- a/infra/scripts/release/files_to_bump.txt +++ b/infra/scripts/release/files_to_bump.txt @@ -8,5 +8,9 @@ infra/charts/feast/charts/feature-server/README.md 3 20 infra/charts/feast/charts/feature-server/values.yaml 8 infra/charts/feast/README.md 11 58 59 infra/charts/feast-python-server/Chart.yaml 5 -infra/charts/feast-python-server/README.md 3 -java/pom.xml 41 +infra/charts/feast-python-server/README.md 5 +infra/charts/feast-feature-server/Chart.yaml 5 +infra/charts/feast-feature-server/README.md 3 +infra/charts/feast-feature-server/values.yaml 12 +java/pom.xml 38 +ui/package.json 3 diff --git a/infra/templates/README.md.jinja2 b/infra/templates/README.md.jinja2 index 6a8ebdbab7..e59a364d81 100644 --- a/infra/templates/README.md.jinja2 +++ b/infra/templates/README.md.jinja2 @@ -21,7 +21,7 @@ Feast (**Fea**ture **St**ore) is an open source feature store for machine learni Feast allows ML platform teams to: -* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (to serve pre-computed features online). 
* **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. This ensure that future feature values do not leak to models during training. * **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another. diff --git a/java/CONTRIBUTING.md b/java/CONTRIBUTING.md index f6c789d984..7ccfe108c0 100644 --- a/java/CONTRIBUTING.md +++ b/java/CONTRIBUTING.md @@ -2,17 +2,40 @@ > The higher level [Development Guide](https://docs.feast.dev/v/master/project/development-guide) > gives contributing to Feast codebase as a whole. -### Overview +## Overview This guide is targeted at developers looking to contribute to Feast components in the feast-java Repository: - [Feast Serving](#feast-serving) -- [Feast Java Client](#feast-java-client) +- [Feast Serving Client](#feast-serving-client) > Don't see the Feast component that you want to contribute to here? > Check out the [Development Guide](https://docs.feast.dev/v/master/project/development-guide) > to learn how Feast components are distributed over multiple repositories. -#### Common Setup +### Repository structure +There are four key top level packages: +- `serving`: Feast Serving (a gRPC service to serve features) +- `serving-client`: Feast Serving Client (a thin Java client to communicate with Feast serving via gRPC ) +- `datatypes`: A symlink to the overall project protos. These include the core serving gRPC protos, proto representations of all objects in the Feast registry. +- `coverage`: Generates JaCoCo coverage reports + +#### Feast Serving +> **Note:** there are references to metrics collection in the code. 
These are unused and exist for legacy reasons (from when this used Spring Boot), but remain in the code until published to StatsD / Prometheus Pushgateway. + +The primary entrypoint into the Feast Serving server is `ServingGuiceApplication`, which connects to the rest of the packages: +- `connectors`: Contains online store connectors (e.g. Redis) +- `exception`: Contains user-facing exceptions thrown by Feast Serving +- `registry`: Logic to parse a Feast file-based registry (in GCS, S3, or local) into the `Registry` proto object, and automatically re-sync the registry. +- `service`: Core logic that exposes and backs the serving APIs. This includes communication with a feature transformation server to execute on demand transformations + - The root code in this package creates the main entrypoint (`ServingServiceV2`) which is injected into `OnlineServingGrpcServiceV2` in `grpc/` implement the gRPC service. + - `config`: Guice modules to power the server and config + - Includes server config / guice modules in `ServerModule` + - Maps overall Feast Serving user configuration from Java to YAML in `ApplicationPropertiesModule` and `ApplicationProperties` + - `controller`: server controllers (right now, only a gRPC health check) + - `grpc`: Implementation of the gRPC serving service + - `interceptors`: gRPC interceptors (currently used to produce metrics around each gRPC request) + +### Common Setup Common Environment Setup for all feast-java Feast components: Ensure following development tools are installed: @@ -20,7 +43,7 @@ Ensure following development tools are installed: - Maven 3.6 - `make` -#### Code Style +### Code Style Feast's Java codebase conforms to the [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html). 
Automatically format the code to conform the style guide by: @@ -33,27 +56,28 @@ mvn spotless:apply > If you're using IntelliJ, you can import these [code style settings](https://github.com/google/styleguide/blob/gh-pages/intellij-java-google-style.xml) > if you'd like to use the IDE's reformat function. -#### Project Makefile +### Project Makefile The Project Makefile provides useful shorthands for common development tasks: +> Note: These commands rely on a local version of `feast` (Python) to be installed Run all Unit tests: ``` make test-java ``` -Run all Integration tests: +Run all Integration tests (note: this also runs GCS + S3 based tests which should fail): ``` make test-java-integration ``` -Building Docker images for Feast Core & Feast Serving: +Building Docker images for Feast Serving: ``` make build-docker REGISTRY=gcr.io/kf-feast VERSION=develop ``` -#### IDE Setup +### IDE Setup If you're using IntelliJ, some additional steps may be needed to make sure IntelliJ autocomplete works as expected. Specifically, proto-generated code is not indexed by IntelliJ. To fix this, navigate to the following window in IntelliJ: `Project Structure > Modules > datatypes-java`, and mark the following folders as `Source` directorys: @@ -64,12 +88,12 @@ Specifically, proto-generated code is not indexed by IntelliJ. To fix this, navi ## Feast Serving See instructions [here](serving/README.md) for developing. -## Feast Java Client +## Feast Serving Client ### Environment Setup -Setting up your development environment for Feast Java SDK: +Setting up your development environment: 1. Complete the feast-java [Common Setup](#common-setup) -> Feast Java Client is a Java Client for retrieving Features from a running Feast Serving instance. +> Feast Serving Client is a Serving Client for retrieving Features from a running Feast Serving instance. > See the [Feast Serving Section](#feast-serving) section for how to get a Feast Serving instance running. 
### Building diff --git a/java/README.md b/java/README.md index 8c3d93628e..53573a6fed 100644 --- a/java/README.md +++ b/java/README.md @@ -3,8 +3,8 @@ ### Overview This repository contains the following Feast components. -* Feast Serving: A service used to serve the latest feature values to models. -* Feast Java SDK: A client used to retrieve features from Feast Serving. +* Feast Serving: A gRPC service used to serve the latest feature values to models. +* Feast Serving Client: A client used to retrieve features from Feast Serving. ### Architecture @@ -16,6 +16,7 @@ Guides on Contributing: - [Contribution Process for Feast](https://docs.feast.dev/v/master/project/contributing) - [Development Guide for Feast](https://docs.feast.dev/v/master/project/development-guide) - [Development Guide for feast-java (this repository)](CONTRIBUTING.md) + - **Note**: includes installing without using Helm ### Installing using Helm Please see the Helm charts in [infra/charts/feast](../infra/charts/feast). 
diff --git a/java/common/pom.xml b/java/common/pom.xml deleted file mode 100644 index 6b580880f1..0000000000 --- a/java/common/pom.xml +++ /dev/null @@ -1,162 +0,0 @@ - - - - 4.0.0 - - - feast-parent - dev.feast - ${revision} - - - Feast Common - Feast common module with functionality that can be reused - feast-common - - - - dev.feast - feast-datatypes - ${project.version} - compile - - - com.google.protobuf - protobuf-java-util - ${protobuf.version} - - - - org.apache.commons - commons-lang3 - 3.6 - - - - - org.projectlombok - lombok - ${lombok.version} - - - com.google.auto.value - auto-value-annotations - ${auto.value.version} - - - - - com.google.code.gson - gson - ${gson.version} - - - io.gsonfire - gson-fire - ${gson.fire.version} - - - com.fasterxml.jackson.core - jackson-databind - 2.12.6.1 - - - com.fasterxml.jackson.datatype - jackson-datatype-jsr310 - ${jackson.version} - - - - - org.slf4j - slf4j-api - - - org.fluentd - fluent-logger - 0.3.1 - - - - javax.xml.bind - jaxb-api - - - javax.validation - validation-api - - - - - com.google.code.findbugs - jsr305 - 3.0.2 - - - - - org.hamcrest - hamcrest-library - test - ${hamcrest.version} - - - - junit - junit - 4.13.2 - - - org.mockito - mockito-core - ${mockito.version} - test - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - - - org.jacoco - jacoco-maven-plugin - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - -Xms2048m -Xmx2048m -Djdk.net.URLClassPath.disableClassPathURLCheck=true - - - - org.sonatype.plugins - nexus-staging-maven-plugin - - true - - - - - diff --git a/java/common/src/main/java/feast/common/logging/AuditLogger.java b/java/common/src/main/java/feast/common/logging/AuditLogger.java deleted file mode 100644 index f3538a794b..0000000000 --- a/java/common/src/main/java/feast/common/logging/AuditLogger.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging; - -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.util.JsonFormat; -import feast.common.logging.config.LoggingProperties; -import feast.common.logging.config.LoggingProperties.AuditLogProperties; -import feast.common.logging.entry.*; -import feast.common.logging.entry.LogResource.ResourceType; -import java.net.InetAddress; -import java.net.UnknownHostException; -import java.util.HashMap; -import java.util.Map; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; -import org.fluentd.logger.FluentLogger; -import org.slf4j.Marker; -import org.slf4j.MarkerFactory; -import org.slf4j.event.Level; - -@Slf4j -public class AuditLogger { - private static final String FLUENTD_DESTINATION = "fluentd"; - private static final Marker AUDIT_MARKER = MarkerFactory.getMarker("AUDIT_MARK"); - private static FluentLogger fluentLogger; - private static AuditLogProperties properties; - private static String artifact; - private static String version; - - public AuditLogger(LoggingProperties loggingProperties, String artifact, String version) { - // Spring runs this constructor when creating the AuditLogger bean, - // which allows us to populate the AuditLogger class with dependencies. 
- // This allows us to use the dependencies in the AuditLogger's static methods - AuditLogger.properties = loggingProperties.getAudit(); - AuditLogger.artifact = artifact; - AuditLogger.version = version; - if (AuditLogger.properties.getMessageLogging() != null - && AuditLogger.properties.getMessageLogging().isEnabled()) { - AuditLogger.fluentLogger = - FluentLogger.getLogger( - "feast", - AuditLogger.properties.getMessageLogging().getFluentdHost(), - AuditLogger.properties.getMessageLogging().getFluentdPort()); - } - } - - /** - * Log the handling of a Protobuf message by a service call. - * - * @param level log level - * @param entryBuilder with all fields set except instance. - */ - public static void logMessage(Level level, MessageAuditLogEntry.Builder entryBuilder) { - log(level, entryBuilder.setComponent(artifact).setVersion(version).build()); - } - - /** - * Log an action being taken on a specific resource - * - * @param level describing the severity of the log. - * @param action name of the action being taken on specific resource. - * @param resourceType the type of resource being logged. - * @param resourceId resource specific identifier identifing the instance of the resource. - */ - public static void logAction( - Level level, String action, ResourceType resourceType, String resourceId) { - log( - level, - ActionAuditLogEntry.of( - artifact, version, LogResource.of(resourceType, resourceId), action)); - } - - /** - * Log a transition in state/status in a specific resource. - * - * @param level describing the severity of the log. - * @param status name of end status which the resource transition to. - * @param resourceType the type of resource being logged. - * @param resourceId resource specific identifier identifing the instance of the resource. 
- */ - public static void logTransition( - Level level, String status, ResourceType resourceType, String resourceId) { - log( - level, - TransitionAuditLogEntry.of( - artifact, version, LogResource.of(resourceType, resourceId), status)); - } - - /** - * Log given {@link AuditLogEntry} at the given logging {@link Level} to the Audit log. - * - * @param level describing the severity of the log. - * @param entry the {@link AuditLogEntry} to push to the audit log. - */ - private static void log(Level level, AuditLogEntry entry) { - // Check if audit logging is of this specific log entry enabled. - if (!properties.isEnabled()) { - return; - } - - // Either forward log to logging layer or log to console - String destination = properties.getMessageLogging().getDestination(); - if (destination.equals(FLUENTD_DESTINATION)) { - if (entry.getKind() == AuditLogEntryKind.MESSAGE) { - Map fluentdLogs = new HashMap<>(); - MessageAuditLogEntry messageAuditLogEntry = (MessageAuditLogEntry) entry; - String releaseName; - - try { - releaseName = - StringUtils.defaultIfEmpty( - System.getenv("RELEASE_NAME"), InetAddress.getLocalHost().getHostAddress()); - } catch (UnknownHostException e) { - releaseName = StringUtils.defaultIfEmpty(System.getenv("RELEASE_NAME"), ""); - } - - fluentdLogs.put("id", messageAuditLogEntry.getId()); - fluentdLogs.put("identity", messageAuditLogEntry.getIdentity()); - fluentdLogs.put("service", messageAuditLogEntry.getService()); - fluentdLogs.put("status_code", messageAuditLogEntry.getStatusCode()); - fluentdLogs.put("method", messageAuditLogEntry.getMethod()); - fluentdLogs.put("release_name", releaseName); - try { - fluentdLogs.put("request", JsonFormat.printer().print(messageAuditLogEntry.getRequest())); - fluentdLogs.put( - "response", JsonFormat.printer().print(messageAuditLogEntry.getResponse())); - } catch (InvalidProtocolBufferException e) { - } - fluentLogger.log("fluentd", fluentdLogs); - } - } else { - // Log event to audit log through enabled 
formats - String entryJSON = entry.toJSON(); - switch (level) { - case TRACE: - log.trace(AUDIT_MARKER, entryJSON); - break; - case DEBUG: - log.debug(AUDIT_MARKER, entryJSON); - break; - case INFO: - log.info(AUDIT_MARKER, entryJSON); - break; - case WARN: - log.warn(AUDIT_MARKER, entryJSON); - break; - case ERROR: - log.error(AUDIT_MARKER, entryJSON); - break; - } - } - } -} diff --git a/java/common/src/main/java/feast/common/logging/config/LoggingProperties.java b/java/common/src/main/java/feast/common/logging/config/LoggingProperties.java deleted file mode 100644 index 06e62f71af..0000000000 --- a/java/common/src/main/java/feast/common/logging/config/LoggingProperties.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging.config; - -import feast.common.validators.OneOfStrings; -import javax.validation.constraints.NotNull; -import lombok.Getter; -import lombok.Setter; - -@Getter -@Setter -public class LoggingProperties { - @NotNull private AuditLogProperties audit; - - @Getter - @Setter - public static class AuditLogProperties { - // Whether to enable/disable audit logging entirely. - private boolean enabled; - - private MessageLogging messageLogging; - - @Getter - @Setter - public static class MessageLogging { - // Whether to enable/disable message level (ie request/response) audit logging. 
- private boolean enabled; - - // Whether to log to console or fluentd - @OneOfStrings({"console", "fluentd"}) - private String destination; - - // fluentD service host for external (request/response) logging. - private String fluentdHost; - - // fluentD service port for external (request/response) logging. - private Integer fluentdPort; - } - } -} diff --git a/java/common/src/main/java/feast/common/logging/entry/ActionAuditLogEntry.java b/java/common/src/main/java/feast/common/logging/entry/ActionAuditLogEntry.java deleted file mode 100644 index 4fdeaee32a..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/ActionAuditLogEntry.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging.entry; - -import com.google.auto.value.AutoValue; - -/** ActionAuditLogEntry records an action being taken on a specific resource */ -@AutoValue -public abstract class ActionAuditLogEntry extends AuditLogEntry { - /** @return The name of the action taken on the resource. */ - public abstract String getAction(); - - /** @return The target resource of which the action was taken on. */ - public abstract LogResource getResource(); - - /** - * Create an {@link AuditLogEntry} that records an action being taken on a specific resource. 
- * - * @param component The name of th Feast component producing this {@link AuditLogEntry}. - * @param version The version of Feast producing this {@link AuditLogEntry}. - * @param resource The target resource of which the action was taken on. - * @param action The name of the action being taken on the given resource. - * @return log entry that records an action being taken on a specific resource - */ - public static ActionAuditLogEntry of( - String component, String version, LogResource resource, String action) { - return new AutoValue_ActionAuditLogEntry( - component, version, AuditLogEntryKind.ACTION, action, resource); - } -} diff --git a/java/common/src/main/java/feast/common/logging/entry/AuditLogEntry.java b/java/common/src/main/java/feast/common/logging/entry/AuditLogEntry.java deleted file mode 100644 index 8148c474b0..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/AuditLogEntry.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging.entry; - -import com.google.gson.Gson; - -/** - * AuditLogEntry represents a single audit Log Entry. Audit log entry can converted into string with - * {{@link #toString()} for human readable representation. Or structured JSON with {{@link - * #toJSON()} for a machine parsable representation. 
- */ -public abstract class AuditLogEntry { - /** Declare Log Type to allow external Logging systems to filter out {@link AuditLogEntry} */ - public final String logType = "FeastAuditLogEntry"; - - public final String application = "Feast"; - - /** - * The name of the Feast component producing this {@link AuditLogEntry} - * - * @return the component - */ - public abstract String getComponent(); - - /** - * The version of Feast producing this {@link AuditLogEntry} - * - * @return version - */ - public abstract String getVersion(); - - public abstract AuditLogEntryKind getKind(); - - /** - * Return a structured JSON representation of this {@link AuditLogEntry} - * - * @return structured JSON representation - */ - public String toJSON() { - Gson gson = new Gson(); - return gson.toJson(this); - } -} diff --git a/java/common/src/main/java/feast/common/logging/entry/AuditLogEntryKind.java b/java/common/src/main/java/feast/common/logging/entry/AuditLogEntryKind.java deleted file mode 100644 index d673f6bdb3..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/AuditLogEntryKind.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package feast.common.logging.entry; - -/** AuditLogEntryKind lists the various kinds of {@link AuditLogEntry} */ -public enum AuditLogEntryKind { - MESSAGE, - ACTION, - TRANSITION, -} diff --git a/java/common/src/main/java/feast/common/logging/entry/LogResource.java b/java/common/src/main/java/feast/common/logging/entry/LogResource.java deleted file mode 100644 index 1d0345a404..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/LogResource.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package feast.common.logging.entry; - -import com.google.auto.value.AutoValue; - -@AutoValue -/** - * LogResource is used in {@link AuditLogEntry} to reference a specific resource as the subject of - * the log - */ -public abstract class LogResource { - public enum ResourceType { - JOB, - FEATURE_TABLE - } - - public abstract ResourceType getType(); - - public abstract String getId(); - - public static LogResource of(ResourceType type, String id) { - return new AutoValue_LogResource(type, id); - } -} diff --git a/java/common/src/main/java/feast/common/logging/entry/MessageAuditLogEntry.java b/java/common/src/main/java/feast/common/logging/entry/MessageAuditLogEntry.java deleted file mode 100644 index 8ad428a3a3..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/MessageAuditLogEntry.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package feast.common.logging.entry; - -import com.google.auto.value.AutoValue; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.JsonParser; -import com.google.gson.JsonSerializer; -import com.google.protobuf.Empty; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.Message; -import com.google.protobuf.util.JsonFormat; -import io.grpc.Status.Code; -import java.util.UUID; - -/** MessageAuditLogEntry records the handling of a Protobuf message by a service call. */ -@AutoValue -public abstract class MessageAuditLogEntry extends AuditLogEntry { - /** @return Id used to identify the service call that the log entry is recording */ - public abstract UUID getId(); - - /** @return The name of the service that was used to handle the service call. */ - public abstract String getService(); - - /** @return The name of the method that was used to handle the service call. */ - public abstract String getMethod(); - - /** - * @return The request Protobuf {@link Message} that was passed to the Service in the service - * call. - */ - public abstract Message getRequest(); - - /** - * @return The response Protobuf {@link Message} that was passed to the Service in the service - * call. May be an {@link Empty} protobuf no request could be collected due to an error. - */ - public abstract Message getResponse(); - - /** - * @return The authenticated identity that was assumed during the handling of the service call. - * For example, the user id or email that identifies the user making the call. Empty if the - * service call is not authenticated. - */ - public abstract String getIdentity(); - - /** @return The result status code of the service call. 
*/ - public abstract Code getStatusCode(); - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setId(UUID id); - - public abstract Builder setComponent(String component); - - public abstract Builder setVersion(String component); - - public abstract Builder setKind(AuditLogEntryKind kind); - - public abstract Builder setService(String name); - - public abstract Builder setMethod(String name); - - public abstract Builder setRequest(Message request); - - public abstract Builder setResponse(Message response); - - public abstract Builder setIdentity(String identity); - - public abstract Builder setStatusCode(Code statusCode); - - public abstract MessageAuditLogEntry build(); - } - - public static MessageAuditLogEntry.Builder newBuilder() { - return new AutoValue_MessageAuditLogEntry.Builder() - .setKind(AuditLogEntryKind.MESSAGE) - .setId(UUID.randomUUID()); - } - - @Override - public String toJSON() { - // GSON requires custom typeadapter (serializer) to convert Protobuf messages to JSON properly - Gson gson = - new GsonBuilder() - .registerTypeAdapter( - Message.class, - (JsonSerializer) - (message, type, context) -> { - try { - String messageJSON = JsonFormat.printer().print(message); - return new JsonParser().parse(messageJSON); - } catch (InvalidProtocolBufferException e) { - - throw new RuntimeException( - "Unexpected exception converting Protobuf to JSON", e); - } - }) - .create(); - return gson.toJson(this); - } -} diff --git a/java/common/src/main/java/feast/common/logging/entry/TransitionAuditLogEntry.java b/java/common/src/main/java/feast/common/logging/entry/TransitionAuditLogEntry.java deleted file mode 100644 index 224f10e0b5..0000000000 --- a/java/common/src/main/java/feast/common/logging/entry/TransitionAuditLogEntry.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you 
may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging.entry; - -import com.google.auto.value.AutoValue; - -/** TransitionAuditLogEntry records a transition in state/status in a specific resource. */ -@AutoValue -public abstract class TransitionAuditLogEntry extends AuditLogEntry { - /** @return The resource which the state/status transition occured. */ - public abstract LogResource getResource(); - - /** @return The end status with the resource transition to. */ - public abstract String getStatus(); - - /** - * Construct a new {@link AuditLogEntry} to record a transition in state/status in a specific - * resource. - * - * @param component The name of th Feast component producing this {@link AuditLogEntry}. - * @param version The version of Feast producing this {@link AuditLogEntry}. - * @param resource the resource which the transtion occured - * @param status the end status which the resource transitioned to. 
- * @return log entry to record a transition in state/status in a specific resource - */ - public static TransitionAuditLogEntry of( - String component, String version, LogResource resource, String status) { - return new AutoValue_TransitionAuditLogEntry( - component, version, AuditLogEntryKind.TRANSITION, resource, status); - } -} diff --git a/java/common/src/main/java/feast/common/logging/interceptors/GrpcMessageInterceptor.java b/java/common/src/main/java/feast/common/logging/interceptors/GrpcMessageInterceptor.java deleted file mode 100644 index e34fefd115..0000000000 --- a/java/common/src/main/java/feast/common/logging/interceptors/GrpcMessageInterceptor.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package feast.common.logging.interceptors; - -import com.google.protobuf.Empty; -import com.google.protobuf.Message; -import feast.common.logging.AuditLogger; -import feast.common.logging.config.LoggingProperties; -import feast.common.logging.entry.MessageAuditLogEntry; -import io.grpc.ForwardingServerCall.SimpleForwardingServerCall; -import io.grpc.ForwardingServerCallListener.SimpleForwardingServerCallListener; -import io.grpc.Metadata; -import io.grpc.ServerCall; -import io.grpc.ServerCall.Listener; -import io.grpc.ServerCallHandler; -import io.grpc.ServerInterceptor; -import io.grpc.Status; -import org.slf4j.event.Level; - -/** - * GrpcMessageInterceptor intercepts a GRPC calls to log handling of GRPC messages to the Audit Log. - * Intercepts the incoming and outgoing messages logs them to the audit log, together with method - * name and assumed authenticated identity (if authentication is enabled). NOTE: - * GrpcMessageInterceptor assumes that all service calls are unary (ie single request/response). - */ -public class GrpcMessageInterceptor implements ServerInterceptor { - private final LoggingProperties loggingProperties; - - /** - * Construct GrpcMessageIntercetor. - * - * @param loggingProperties properties used to configure logging interceptor. - */ - public GrpcMessageInterceptor(LoggingProperties loggingProperties) { - this.loggingProperties = loggingProperties; - } - - @Override - public Listener interceptCall( - ServerCall call, Metadata headers, ServerCallHandler next) { - // Disable the message logging interceptor entirely if message logging is disabled. - if (!loggingProperties.getAudit().getMessageLogging().isEnabled()) { - return next.startCall(call, headers); - } - - MessageAuditLogEntry.Builder entryBuilder = MessageAuditLogEntry.newBuilder(); - // default response/request message to empty proto in log entry. - // request could be empty when the client closes the connection before sending a request - // message. 
- // response could be unset when the service encounters an error when processsing the service - // call. - entryBuilder.setRequest(Empty.newBuilder().build()); - entryBuilder.setResponse(Empty.newBuilder().build()); - - // Unpack service & method name from call - // full method name is in format ./ - String fullMethodName = call.getMethodDescriptor().getFullMethodName(); - entryBuilder.setService( - fullMethodName.substring(fullMethodName.lastIndexOf(".") + 1, fullMethodName.indexOf("/"))); - entryBuilder.setMethod(fullMethodName.substring(fullMethodName.indexOf("/") + 1)); - - // Attempt Extract current authenticated identity. - entryBuilder.setIdentity(""); - - // Register forwarding call to intercept outgoing response and log to audit log - call = - new SimpleForwardingServerCall<>(call) { - @Override - public void sendMessage(RespT message) { - // 2. Track the response & Log entry to audit logger - super.sendMessage(message); - entryBuilder.setResponse((Message) message); - } - - @Override - public void close(Status status, Metadata trailers) { - super.close(status, trailers); - // 3. Log the message log entry to the audit log - Level logLevel = (status.isOk()) ? Level.INFO : Level.ERROR; - entryBuilder.setStatusCode(status.getCode()); - AuditLogger.logMessage(logLevel, entryBuilder); - } - }; - - ServerCall.Listener listener = next.startCall(call, headers); - return new SimpleForwardingServerCallListener<>(listener) { - @Override - // Register listener to intercept incoming request messages and log to audit log - public void onMessage(ReqT message) { - super.onMessage(message); - // 1. Track the request. 
- entryBuilder.setRequest((Message) message); - } - }; - } -} diff --git a/java/common/src/main/java/feast/common/validators/OneOfStringValidator.java b/java/common/src/main/java/feast/common/validators/OneOfStringValidator.java deleted file mode 100644 index 924953a2c4..0000000000 --- a/java/common/src/main/java/feast/common/validators/OneOfStringValidator.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.validators; - -import java.util.Arrays; -import javax.validation.ConstraintValidator; -import javax.validation.ConstraintValidatorContext; - -/** Validates whether a string value is found within a collection. */ -public class OneOfStringValidator implements ConstraintValidator { - - /** Values that are permitted for a specific instance of this validator */ - String[] allowedValues; - - /** - * Initialize the OneOfStringValidator with a collection of allowed String values. - * - * @param constraintAnnotation constraint annotation - */ - @Override - public void initialize(OneOfStrings constraintAnnotation) { - allowedValues = constraintAnnotation.value(); - } - - /** - * Validates whether a string value is found within the collection defined in the annotation. 
- * - * @param value String value that should be validated - * @param context Provides contextual data and operation when applying a given constraint - * validator - * @return Boolean value indicating whether the string is found within the allowed values. - */ - @Override - public boolean isValid(String value, ConstraintValidatorContext context) { - return Arrays.asList(allowedValues).contains(value); - } -} diff --git a/java/common/src/main/java/feast/common/validators/OneOfStrings.java b/java/common/src/main/java/feast/common/validators/OneOfStrings.java deleted file mode 100644 index b236f6f1af..0000000000 --- a/java/common/src/main/java/feast/common/validators/OneOfStrings.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.validators; - -import java.lang.annotation.*; -import javax.validation.Constraint; -import javax.validation.Payload; - -/** - * Annotation for String "one of" validation. Allows for the definition of a collection through an - * annotation. The collection is used to test values defined in the object. 
- */ -@Target({ - ElementType.METHOD, - ElementType.FIELD, - ElementType.ANNOTATION_TYPE, - ElementType.CONSTRUCTOR, - ElementType.PARAMETER -}) -@Retention(RetentionPolicy.RUNTIME) -@Documented -@Constraint(validatedBy = OneOfStringValidator.class) -public @interface OneOfStrings { - /** @return Default error message that is returned if the incorrect value is set */ - String message() default "Field value must be one of the following: {value}"; - - /** @return Allows for the specification of validation groups to which this constraint belongs. */ - Class[] groups() default {}; - - /** - * @return An attribute payload that can be used to assign custom payload objects to a constraint. - */ - Class[] payload() default {}; - - /** @return Default value that is returned if no allowed values are configured */ - String[] value() default {}; -} diff --git a/java/common/src/main/resources/log4j2.xml b/java/common/src/main/resources/log4j2.xml deleted file mode 100644 index c75c2db13c..0000000000 --- a/java/common/src/main/resources/log4j2.xml +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - %d{yyyy-MM-dd HH:mm:ss.SSS} %5p ${hostName} --- [%15.15t] %-40.40c{1.} : %m%n%ex - - - {"time":"%d{yyyy-MM-dd'T'HH:mm:ssXXX}","hostname":"${hostName}","severity":"%p","message":%m}%n%ex - - - - - - - - - - - - - - - - - - - - - - - diff --git a/java/common/src/test/java/feast/common/logging/entry/AuditLogEntryTest.java b/java/common/src/test/java/feast/common/logging/entry/AuditLogEntryTest.java deleted file mode 100644 index bc3dcbcf74..0000000000 --- a/java/common/src/test/java/feast/common/logging/entry/AuditLogEntryTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.common.logging.entry; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.equalTo; - -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import com.google.protobuf.Timestamp; -import feast.common.logging.entry.LogResource.ResourceType; -import feast.proto.serving.ServingAPIProto; -import feast.proto.serving.ServingAPIProto.FeatureReferenceV2; -import feast.proto.serving.ServingAPIProto.GetOnlineFeaturesRequestV2; -import feast.proto.serving.ServingAPIProto.GetOnlineFeaturesResponse; -import feast.proto.types.ValueProto.Value; -import io.grpc.Status; -import java.util.Arrays; -import java.util.List; -import org.junit.Test; - -public class AuditLogEntryTest { - public List getTestAuditLogs() { - GetOnlineFeaturesRequestV2 requestSpec = - GetOnlineFeaturesRequestV2.newBuilder() - .addAllFeatures( - Arrays.asList( - FeatureReferenceV2.newBuilder() - .setFeatureViewName("featuretable_1") - .setFeatureName("feature1") - .build(), - FeatureReferenceV2.newBuilder() - .setFeatureViewName("featuretable_1") - .setFeatureName("feature2") - .build())) - .build(); - - GetOnlineFeaturesResponse responseSpec = - GetOnlineFeaturesResponse.newBuilder() - .setMetadata( - ServingAPIProto.GetOnlineFeaturesResponseMetadata.newBuilder() - .setFeatureNames( - ServingAPIProto.FeatureList.newBuilder() - .addAllVal( - Arrays.asList( - "featuretable_1:feature_1", "featuretable_1:feature2")))) - .addAllResults( - Arrays.asList( - GetOnlineFeaturesResponse.FeatureVector.newBuilder() - 
.addValues(Value.newBuilder().setInt32Val(32).build()) - .addStatuses(ServingAPIProto.FieldStatus.PRESENT) - .addEventTimestamps(Timestamp.newBuilder().build()) - .build(), - GetOnlineFeaturesResponse.FeatureVector.newBuilder() - .addValues(Value.newBuilder().setInt32Val(64).build()) - .addStatuses(ServingAPIProto.FieldStatus.PRESENT) - .addEventTimestamps(Timestamp.newBuilder().build()) - .build())) - .build(); - - return Arrays.asList( - MessageAuditLogEntry.newBuilder() - .setComponent("feast-serving") - .setVersion("0.9") - .setService("ServingService") - .setMethod("getOnlineFeatures") - .setRequest(requestSpec) - .setResponse(responseSpec) - .setStatusCode(Status.OK.getCode()) - .setIdentity("adam@no.such.email") - .build(), - ActionAuditLogEntry.of( - "core", "0.9", LogResource.of(ResourceType.JOB, "kafka-to-redis"), "CREATE"), - TransitionAuditLogEntry.of( - "core", "0.9", LogResource.of(ResourceType.FEATURE_TABLE, "featuretable_1"), "READY")); - } - - @Test - public void shouldReturnJSONRepresentationOfAuditLog() { - for (AuditLogEntry auditLog : getTestAuditLogs()) { - // Check that auditLog's toJSON() returns valid JSON - String logJSON = auditLog.toJSON(); - System.out.println(logJSON); - JsonParser parser = new JsonParser(); - - // check basic fields are present in JSON representation. - JsonObject logObject = parser.parse(logJSON).getAsJsonObject(); - assertThat(logObject.getAsJsonPrimitive("logType").getAsString(), equalTo(auditLog.logType)); - assertThat( - logObject.getAsJsonPrimitive("kind").getAsString(), equalTo(auditLog.getKind().name())); - } - } -} diff --git a/java/docs/coverage/pom.xml b/java/coverage/pom.xml similarity index 85% rename from java/docs/coverage/pom.xml rename to java/coverage/pom.xml index f6e08909ee..a604135c79 100644 --- a/java/docs/coverage/pom.xml +++ b/java/coverage/pom.xml @@ -30,7 +30,7 @@ dev.feast feast-parent ${revision} - ../.. + .. 
Feast Coverage Java @@ -41,18 +41,6 @@ - - dev.feast - feast-storage-api - ${project.version} - - - - dev.feast - feast-storage-connector-redis - ${project.version} - - dev.feast feast-serving diff --git a/java/infra/docker/feature-server/Dockerfile b/java/infra/docker/feature-server/Dockerfile index dbd8c91472..a728340d6b 100644 --- a/java/infra/docker/feature-server/Dockerfile +++ b/java/infra/docker/feature-server/Dockerfile @@ -8,13 +8,9 @@ WORKDIR /build COPY java/pom.xml . COPY java/datatypes/pom.xml datatypes/pom.xml -COPY java/common/pom.xml common/pom.xml COPY java/serving/pom.xml serving/pom.xml -COPY java/storage/api/pom.xml storage/api/pom.xml -COPY java/storage/connectors/pom.xml storage/connectors/pom.xml -COPY java/storage/connectors/redis/pom.xml storage/connectors/redis/pom.xml -COPY java/sdk/pom.xml sdk/pom.xml -COPY java/docs/coverage/pom.xml docs/coverage/pom.xml +COPY java/serving-client/pom.xml serving-client/pom.xml +COPY java/coverage/pom.xml coverage/pom.xml # Setting Maven repository .m2 directory relative to /build folder gives the # user to optionally use cached repository when building the image by copying @@ -28,7 +24,7 @@ COPY protos/feast datatypes/src/main/proto/feast ARG VERSION=dev RUN mvn --also-make --projects serving -Drevision=$VERSION \ - -DskipUTs=true --batch-mode clean package + -DskipUTs=true -DskipITs=true --batch-mode clean package # # Download grpc_health_probe to run health check for Feast Serving # https://kubernetes.io/blog/2018/10/01/health-checking-grpc-servers-on-kubernetes/ diff --git a/java/infra/docker/feature-server/Dockerfile.dev b/java/infra/docker/feature-server/Dockerfile.dev index 93bbbbb718..4eaec41ae3 100644 --- a/java/infra/docker/feature-server/Dockerfile.dev +++ b/java/infra/docker/feature-server/Dockerfile.dev @@ -7,7 +7,7 @@ ARG REVISION=dev RUN wget -q https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/v0.3.1/grpc_health_probe-linux-amd64 \ -O /usr/bin/grpc-health-probe && \ 
chmod +x /usr/bin/grpc-health-probe -ADD $PWD/serving/target/feast-serving-$REVISION-exec.jar /opt/feast/feast-serving.jar +ADD $PWD/java/serving/target/feast-serving-$REVISION-jar-with-dependencies.jar /opt/feast/feast-serving.jar CMD ["java",\ "-Xms1024m",\ "-Xmx1024m",\ diff --git a/java/pom.xml b/java/pom.xml index 0bf92ee244..9cff26daa6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -29,16 +29,13 @@ datatypes - storage/api - storage/connectors serving - sdk - docs/coverage - common + serving-client + coverage - 0.23.0 + 0.24.0 https://github.com/feast-dev/feast UTF-8 @@ -91,6 +88,7 @@ */ ]]> + ${maven.multiModuleProjectDirectory} false diff --git a/java/sdk/pom.xml b/java/serving-client/pom.xml similarity index 97% rename from java/sdk/pom.xml rename to java/serving-client/pom.xml index 5896214b27..7b8838a009 100644 --- a/java/sdk/pom.xml +++ b/java/serving-client/pom.xml @@ -4,8 +4,8 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - Feast SDK for Java - SDK for registering, storing, and retrieving features + Feast Serving Client + Client for retrieving features from a Feast feature server feast-serving-client diff --git a/java/sdk/src/main/java/dev/feast/FeastClient.java b/java/serving-client/src/main/java/dev/feast/FeastClient.java similarity index 100% rename from java/sdk/src/main/java/dev/feast/FeastClient.java rename to java/serving-client/src/main/java/dev/feast/FeastClient.java diff --git a/java/sdk/src/main/java/dev/feast/RequestUtil.java b/java/serving-client/src/main/java/dev/feast/RequestUtil.java similarity index 100% rename from java/sdk/src/main/java/dev/feast/RequestUtil.java rename to java/serving-client/src/main/java/dev/feast/RequestUtil.java diff --git a/java/sdk/src/main/java/dev/feast/Row.java b/java/serving-client/src/main/java/dev/feast/Row.java similarity index 100% rename from java/sdk/src/main/java/dev/feast/Row.java rename to 
java/serving-client/src/main/java/dev/feast/Row.java diff --git a/java/sdk/src/main/java/dev/feast/SecurityConfig.java b/java/serving-client/src/main/java/dev/feast/SecurityConfig.java similarity index 100% rename from java/sdk/src/main/java/dev/feast/SecurityConfig.java rename to java/serving-client/src/main/java/dev/feast/SecurityConfig.java diff --git a/java/sdk/src/test/java/dev/feast/FeastClientTest.java b/java/serving-client/src/test/java/dev/feast/FeastClientTest.java similarity index 100% rename from java/sdk/src/test/java/dev/feast/FeastClientTest.java rename to java/serving-client/src/test/java/dev/feast/FeastClientTest.java diff --git a/java/sdk/src/test/java/dev/feast/RequestUtilTest.java b/java/serving-client/src/test/java/dev/feast/RequestUtilTest.java similarity index 100% rename from java/sdk/src/test/java/dev/feast/RequestUtilTest.java rename to java/serving-client/src/test/java/dev/feast/RequestUtilTest.java diff --git a/java/serving/README.md b/java/serving/README.md index 5ac7194924..dc23702d0f 100644 --- a/java/serving/README.md +++ b/java/serving/README.md @@ -3,14 +3,18 @@ ### Overview This guide is targeted at developers looking to contribute to Feast Serving: - [Building and running Feast Serving locally](#building-and-running-feast-serving-locally) +- [Unit / Integration Tests](#unit-/-integration-tests) +- [Developing against Feast Helm charts](#developing-against-feast-helm-charts) -### Pre-requisites: +### Building and running Feast Serving locally: + +#### Pre-requisites - [Maven](https://maven.apache.org/install.html) build tool version 3.6.x - A Feast feature repo (e.g. https://github.com/feast-dev/feast-demo) - A running Store instance e.g. local Redis instance with `redis-server` -### Building and running Feast Serving locally: +#### Steps From the Feast GitHub root, run: 1. 
`mvn -f java/pom.xml install -Dmaven.test.skip=true` @@ -41,12 +45,12 @@ From the Feast GitHub root, run: java \ -Xms1g \ -Xmx4g \ - -jar java/serving/target/feast-serving-0.17.1-SNAPSHOT-jar-with-dependencies.jar \ + -jar java/serving/target/feast-serving-[YOUR VERSION]-jar-with-dependencies.jar \ classpath:/application.yml,file:./application-override.yaml ``` 5. Now you have a Feast Serving gRPC service running on port 6566 locally! -### Running test queries +#### Running test queries If you have [grpc_cli](https://github.com/grpc/grpc/blob/master/doc/command_line_tool.md) installed, you can check that Feast Serving is running ``` grpc_cli ls localhost:6566 @@ -116,7 +120,7 @@ results { Rpc succeeded with OK status ``` -### Debugging Feast Serving +#### Debugging Feast Serving You can debug this like any other Java executable. Swap the java command above with: ``` java \ @@ -124,7 +128,7 @@ You can debug this like any other Java executable. Swap the java command above w -Xrunjdwp:transport=dt_socket,address=5005,server=y,suspend=y \ -Xms1g \ -Xmx4g \ - -jar java/serving/target/feast-serving-0.17.1-SNAPSHOT-jar-with-dependencies.jar \ + -jar java/serving/target/feast-serving-[YOUR VERSION]-jar-with-dependencies.jar \ classpath:/application.yml,file:./application-override.yaml ``` Now you can attach e.g. a Remote debugger in IntelliJ to port 5005 to debug / make breakpoints. @@ -136,4 +140,10 @@ Unit & Integration Tests can be used to verify functionality: mvn test -pl serving --also-make # run integration tests mvn verify -pl serving --also-make -``` \ No newline at end of file +# run integration tests with debugger +mvn -Dmaven.failsafe.debug verify -pl serving --also-make +``` + +### Developing against Feast Helm charts +Look at [java-demo](../../examples/java-demo) for steps on how to update the helm chart or java logic and test their +interactions. 
\ No newline at end of file diff --git a/java/serving/pom.xml b/java/serving/pom.xml index f173cdd5fe..8f0cf407e9 100644 --- a/java/serving/pom.xml +++ b/java/serving/pom.xml @@ -82,6 +82,29 @@ + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + + + python + src/test/resources/docker-compose/feast10/ + + setup_it.py + + ${skipITs} + + feast_test_apply + process-test-resources + + exec + + + + @@ -92,24 +115,6 @@ ${project.version} - - dev.feast - feast-common - ${project.version} - - - - dev.feast - feast-storage-api - ${project.version} - - - - dev.feast - feast-storage-connector-redis - ${project.version} - - com.google.inject guice @@ -345,6 +350,16 @@ 2.7.4 test + + io.lettuce + lettuce-core + 6.0.2.RELEASE + + + org.apache.commons + commons-lang3 + 3.10 + diff --git a/java/serving/src/main/java/feast/serving/ServingGuiceApplication.java b/java/serving/src/main/java/feast/serving/ServingGuiceApplication.java index 664d6dd4ec..d91af8abb1 100644 --- a/java/serving/src/main/java/feast/serving/ServingGuiceApplication.java +++ b/java/serving/src/main/java/feast/serving/ServingGuiceApplication.java @@ -18,7 +18,7 @@ import com.google.inject.Guice; import com.google.inject.Injector; -import feast.serving.config.*; +import feast.serving.service.config.*; import io.grpc.Server; import java.io.IOException; @@ -32,9 +32,9 @@ public static void main(String[] args) throws InterruptedException, IOException final Injector i = Guice.createInjector( - new ServingServiceConfigV2(), - new RegistryConfig(), - new InstrumentationConfig(), + new ServingServiceV2Module(), + new RegistryConfigModule(), + new InstrumentationConfigModule(), new ServerModule(), new ApplicationPropertiesModule(args)); diff --git a/java/storage/api/src/main/java/feast/storage/api/retriever/Feature.java b/java/serving/src/main/java/feast/serving/connectors/Feature.java similarity index 94% rename from java/storage/api/src/main/java/feast/storage/api/retriever/Feature.java rename to 
java/serving/src/main/java/feast/serving/connectors/Feature.java index 92ae1f31fb..af96a90866 100644 --- a/java/storage/api/src/main/java/feast/storage/api/retriever/Feature.java +++ b/java/serving/src/main/java/feast/serving/connectors/Feature.java @@ -14,18 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.api.retriever; +package feast.serving.connectors; import com.google.protobuf.Timestamp; import feast.proto.serving.ServingAPIProto.FeatureReferenceV2; import feast.proto.types.ValueProto; import feast.proto.types.ValueProto.Value; import java.util.HashMap; +import java.util.Map; public interface Feature { - HashMap TYPE_TO_VAL_CASE = - new HashMap() { + Map TYPE_TO_VAL_CASE = + new HashMap<>() { { put(ValueProto.ValueType.Enum.BYTES, ValueProto.Value.ValCase.BYTES_VAL); put(ValueProto.ValueType.Enum.STRING, ValueProto.Value.ValCase.STRING_VAL); diff --git a/java/storage/api/src/main/java/feast/storage/api/retriever/OnlineRetrieverV2.java b/java/serving/src/main/java/feast/serving/connectors/OnlineRetriever.java similarity index 96% rename from java/storage/api/src/main/java/feast/storage/api/retriever/OnlineRetrieverV2.java rename to java/serving/src/main/java/feast/serving/connectors/OnlineRetriever.java index fde8ba7396..79c062814b 100644 --- a/java/storage/api/src/main/java/feast/storage/api/retriever/OnlineRetrieverV2.java +++ b/java/serving/src/main/java/feast/serving/connectors/OnlineRetriever.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.storage.api.retriever; +package feast.serving.connectors; import feast.proto.serving.ServingAPIProto; import feast.proto.types.ValueProto; import java.util.List; import java.util.Map; -public interface OnlineRetrieverV2 { +public interface OnlineRetriever { /** * Get online features for the given entity rows using data retrieved from the Feature references * specified in FeatureTable request. diff --git a/java/storage/api/src/main/java/feast/storage/api/retriever/ProtoFeature.java b/java/serving/src/main/java/feast/serving/connectors/ProtoFeature.java similarity index 98% rename from java/storage/api/src/main/java/feast/storage/api/retriever/ProtoFeature.java rename to java/serving/src/main/java/feast/serving/connectors/ProtoFeature.java index 09f6b75f49..9820898d00 100644 --- a/java/storage/api/src/main/java/feast/storage/api/retriever/ProtoFeature.java +++ b/java/serving/src/main/java/feast/serving/connectors/ProtoFeature.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.storage.api.retriever; +package feast.serving.connectors; import com.google.protobuf.Timestamp; import feast.proto.serving.ServingAPIProto; diff --git a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisHashDecoder.java b/java/serving/src/main/java/feast/serving/connectors/redis/common/RedisHashDecoder.java similarity index 96% rename from java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisHashDecoder.java rename to java/serving/src/main/java/feast/serving/connectors/redis/common/RedisHashDecoder.java index 78b64fd141..9f5c94924d 100644 --- a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisHashDecoder.java +++ b/java/serving/src/main/java/feast/serving/connectors/redis/common/RedisHashDecoder.java @@ -14,15 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.redis.common; +package feast.serving.connectors.redis.common; import com.google.common.hash.Hashing; import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.Timestamp; import feast.proto.serving.ServingAPIProto; import feast.proto.types.ValueProto; -import feast.storage.api.retriever.Feature; -import feast.storage.api.retriever.ProtoFeature; +import feast.serving.connectors.Feature; +import feast.serving.connectors.ProtoFeature; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.*; diff --git a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisKeyGenerator.java b/java/serving/src/main/java/feast/serving/connectors/redis/common/RedisKeyGenerator.java similarity index 97% rename from java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisKeyGenerator.java rename to java/serving/src/main/java/feast/serving/connectors/redis/common/RedisKeyGenerator.java index 
389ca0abfd..defb337a82 100644 --- a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/common/RedisKeyGenerator.java +++ b/java/serving/src/main/java/feast/serving/connectors/redis/common/RedisKeyGenerator.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.redis.common; +package feast.serving.connectors.redis.common; import feast.proto.serving.ServingAPIProto; import feast.proto.storage.RedisProto; diff --git a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializer.java b/java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializer.java similarity index 94% rename from java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializer.java rename to java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializer.java index 6220dd29d4..d25f0da4f9 100644 --- a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializer.java +++ b/java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializer.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.storage.connectors.redis.retriever; +package feast.serving.connectors.redis.retriever; import feast.proto.storage.RedisProto; diff --git a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java b/java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializerV2.java similarity index 96% rename from java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java rename to java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializerV2.java index f99e5cbdb1..672f4d7c31 100644 --- a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java +++ b/java/serving/src/main/java/feast/serving/connectors/redis/retriever/EntityKeySerializerV2.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.redis.retriever; +package feast.serving.connectors.redis.retriever; import com.google.protobuf.ProtocolStringList; import feast.proto.storage.RedisProto; @@ -87,14 +87,15 @@ public byte[] serialize(RedisProto.RedisKeyV2 entityKey) { break; case INT64_VAL: buffer.addAll(encodeInteger(ValueProto.ValueType.Enum.INT64.getNumber())); - buffer.addAll(encodeInteger(Integer.BYTES)); /* This is super dumb - but in https://github.com/feast-dev/feast/blob/dcae1606f53028ce5413567fb8b66f92cfef0f8e/sdk/python/feast/infra/key_encoding_utils.py#L9 we use `struct.pack(" tracerOptional; + private final OnlineRetriever retriever; private final RegistryRepository registryRepository; private final OnlineTransformationService onlineTransformationService; private final String project; @@ -56,16 +62,16 @@ public class OnlineServingServiceV2 implements ServingServiceV2 { ValueProto.Value.newBuilder().setStringVal(DUMMY_ENTITY_VAL).build(); public OnlineServingServiceV2( - 
OnlineRetrieverV2 retriever, - Tracer tracer, + OnlineRetriever retriever, RegistryRepository registryRepository, OnlineTransformationService onlineTransformationService, - String project) { + String project, + Optional tracerOptional) { this.retriever = retriever; - this.tracer = tracer; this.registryRepository = registryRepository; this.onlineTransformationService = onlineTransformationService; this.project = project; + this.tracerOptional = tracerOptional; } /** {@inheritDoc} */ @@ -107,20 +113,21 @@ public ServingAPIProto.GetOnlineFeaturesResponse getOnlineFeatures( List> entityRows = getEntityRows(request); - Span storageRetrievalSpan = tracer.buildSpan("storageRetrieval").start(); + Span storageRetrievalSpan = + tracerOptional.map(tracer -> tracer.buildSpan("storageRetrieval").start()).orElse(null); if (storageRetrievalSpan != null) { storageRetrievalSpan.setTag("entities", entityRows.size()); storageRetrievalSpan.setTag("features", retrievedFeatureReferences.size()); } - List> features = - retrieveFeatures(retrievedFeatureReferences, entityRows); + List> features = retrieveFeatures(retrievedFeatureReferences, entityRows); if (storageRetrievalSpan != null) { storageRetrievalSpan.finish(); } - Span postProcessingSpan = tracer.buildSpan("postProcessing").start(); + Span postProcessingSpan = + tracerOptional.map(tracer -> tracer.buildSpan("postProcessing").start()).orElse(null); ServingAPIProto.GetOnlineFeaturesResponse.Builder responseBuilder = ServingAPIProto.GetOnlineFeaturesResponse.newBuilder(); @@ -141,7 +148,7 @@ public ServingAPIProto.GetOnlineFeaturesResponse getOnlineFeatures( responseBuilder.addResultsBuilder(); for (int rowIdx = 0; rowIdx < features.size(); rowIdx++) { - feast.storage.api.retriever.Feature feature = features.get(rowIdx).get(featureIdx); + Feature feature = features.get(rowIdx).get(featureIdx); if (feature == null) { vectorBuilder.addValues(nullValue); vectorBuilder.addStatuses(FieldStatus.NOT_FOUND); @@ -172,7 +179,7 @@ public 
ServingAPIProto.GetOnlineFeaturesResponse getOnlineFeatures( ServingAPIProto.FeatureList.newBuilder() .addAllVal( retrievedFeatureReferences.stream() - .map(Feature::getFeatureReference) + .map(FeatureUtil::getFeatureReference) .collect(Collectors.toList())))); if (postProcessingSpan != null) { @@ -202,7 +209,7 @@ private List getFeaturesList( ServingAPIProto.GetOnlineFeaturesRequest request) { if (request.getFeatures().getValCount() > 0) { return request.getFeatures().getValList().stream() - .map(Feature::parseFeatureReference) + .map(FeatureUtil::parseFeatureReference) .collect(Collectors.toList()); } @@ -246,7 +253,7 @@ private List> getEntityRows( return entityRows; } - private List> retrieveFeatures( + private List> retrieveFeatures( List featureReferences, List> entityRows) { // Prepare feature reference to index mapping. This mapping will be used to arrange the // retrieved features to the same order as in the input. @@ -267,10 +274,9 @@ private List> retrieveFeatures( } // Create placeholders for retrieved features. 
- List> features = new ArrayList<>(entityRows.size()); + List> features = new ArrayList<>(entityRows.size()); for (int i = 0; i < entityRows.size(); i++) { - List featuresPerEntity = - new ArrayList<>(featureReferences.size()); + List featuresPerEntity = new ArrayList<>(featureReferences.size()); for (int j = 0; j < featureReferences.size(); j++) { featuresPerEntity.add(null); } @@ -311,7 +317,7 @@ private List> retrieveFeatures( }); entityRowsPerGroup.add(entityRowPerGroup); } - List> featuresPerGroup = + List> featuresPerGroup = retriever.getOnlineFeatures(entityRowsPerGroup, featureReferencesPerGroup, entityNames); for (int i = 0; i < featuresPerGroup.size(); i++) { for (int j = 0; j < featureReferencesPerGroup.size(); j++) { @@ -329,7 +335,7 @@ private void populateOnDemandFeatures( List onDemandFeatureSources, List retrievedFeatureReferences, ServingAPIProto.GetOnlineFeaturesRequest request, - List> features, + List> features, ServingAPIProto.GetOnlineFeaturesResponse.Builder responseBuilder) { List>> onDemandContext = @@ -366,7 +372,7 @@ private void populateOnDemandFeatures( // Send out requests to the FTS and process the responses. Set onDemandFeatureStringReferences = onDemandFeatureReferences.stream() - .map(r -> Feature.getFeatureReference(r)) + .map(r -> FeatureUtil.getFeatureReference(r)) .collect(Collectors.toSet()); for (FeatureReferenceV2 featureReference : onDemandFeatureReferences) { @@ -418,7 +424,7 @@ private static FieldStatus getFeatureStatus(ValueProto.Value value, boolean isOu * @param maxAge feature's max age. 
*/ private static boolean checkOutsideMaxAge( - feast.storage.api.retriever.Feature feature, Timestamp entityTimestamp, Duration maxAge) { + Feature feature, Timestamp entityTimestamp, Duration maxAge) { if (maxAge.equals(Duration.getDefaultInstance())) { // max age is not set return false; @@ -457,7 +463,7 @@ private void populateHistogramMetrics( private void populateCountMetrics( FeatureReferenceV2 featureRef, ServingAPIProto.GetOnlineFeaturesResponse.FeatureVectorOrBuilder featureVector) { - String featureRefString = Feature.getFeatureReference(featureRef); + String featureRefString = FeatureUtil.getFeatureReference(featureRef); featureVector .getStatusesList() .forEach( @@ -475,7 +481,7 @@ private void populateFeatureCountMetrics(List featureReferen featureReferences.forEach( featureReference -> Metrics.requestFeatureCount - .labels(project, Feature.getFeatureReference(featureReference)) + .labels(project, FeatureUtil.getFeatureReference(featureReference)) .inc()); } } diff --git a/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java b/java/serving/src/main/java/feast/serving/service/config/ApplicationProperties.java similarity index 94% rename from java/serving/src/main/java/feast/serving/config/ApplicationProperties.java rename to java/serving/src/main/java/feast/serving/service/config/ApplicationProperties.java index 5850eb6483..e4c33434a1 100644 --- a/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java +++ b/java/serving/src/main/java/feast/serving/service/config/ApplicationProperties.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.serving.config; +package feast.serving.service.config; // Feast configuration properties that maps Feast configuration from default application.yml file to // a Java object. 
@@ -23,16 +23,14 @@ import com.fasterxml.jackson.annotation.JsonMerge; import com.fasterxml.jackson.annotation.OptBoolean; -import feast.common.logging.config.LoggingProperties; -import feast.storage.connectors.redis.retriever.RedisClusterStoreConfig; -import feast.storage.connectors.redis.retriever.RedisStoreConfig; +import feast.serving.connectors.redis.retriever.RedisClusterStoreConfig; +import feast.serving.connectors.redis.retriever.RedisStoreConfig; import io.lettuce.core.ReadFrom; import java.time.Duration; import java.util.*; import javax.annotation.PostConstruct; import javax.validation.*; import javax.validation.constraints.NotBlank; -import javax.validation.constraints.NotNull; import org.slf4j.Logger; /** Feast Serving properties. */ @@ -58,14 +56,6 @@ public void setGrpc(GrpcServer grpc) { this.grpc = grpc; } - public RestServer getRest() { - return rest; - } - - public void setRest(RestServer rest) { - this.rest = rest; - } - /** * Validates all FeastProperties. This method runs after properties have been initialized and * individually and conditionally validates each class. @@ -103,8 +93,6 @@ public static class FeastProperties { private List stores = new ArrayList<>(); /* Metric tracing properties. 
*/ private TracingProperties tracing; - /* Feast Audit Logging properties */ - @NotNull private LoggingProperties logging; private String gcpProject; private String awsRegion; private String transformationServiceEndpoint; @@ -195,15 +183,6 @@ public void setTracing(TracingProperties tracing) { this.tracing = tracing; } - /** - * Gets logging properties - * - * @return logging properties - */ - public LoggingProperties getLogging() { - return logging; - } - public String getGcpProject() { return gcpProject; } diff --git a/java/serving/src/main/java/feast/serving/config/ApplicationPropertiesModule.java b/java/serving/src/main/java/feast/serving/service/config/ApplicationPropertiesModule.java similarity index 98% rename from java/serving/src/main/java/feast/serving/config/ApplicationPropertiesModule.java rename to java/serving/src/main/java/feast/serving/service/config/ApplicationPropertiesModule.java index 07183fc710..3575733073 100644 --- a/java/serving/src/main/java/feast/serving/config/ApplicationPropertiesModule.java +++ b/java/serving/src/main/java/feast/serving/service/config/ApplicationPropertiesModule.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.serving.config; +package feast.serving.service.config; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; diff --git a/java/serving/src/main/java/feast/serving/config/InstrumentationConfig.java b/java/serving/src/main/java/feast/serving/service/config/InstrumentationConfigModule.java similarity index 93% rename from java/serving/src/main/java/feast/serving/config/InstrumentationConfig.java rename to java/serving/src/main/java/feast/serving/service/config/InstrumentationConfigModule.java index 7f8590bb84..3eb748e451 100644 --- a/java/serving/src/main/java/feast/serving/config/InstrumentationConfig.java +++ b/java/serving/src/main/java/feast/serving/service/config/InstrumentationConfigModule.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.serving.config; +package feast.serving.service.config; import com.google.inject.AbstractModule; import com.google.inject.Provides; @@ -22,7 +22,7 @@ import io.opentracing.contrib.grpc.TracingServerInterceptor; import io.opentracing.noop.NoopTracerFactory; -public class InstrumentationConfig extends AbstractModule { +public class InstrumentationConfigModule extends AbstractModule { @Provides public Tracer tracer(ApplicationProperties applicationProperties) { diff --git a/java/serving/src/main/java/feast/serving/config/RegistryConfig.java b/java/serving/src/main/java/feast/serving/service/config/RegistryConfigModule.java similarity index 96% rename from java/serving/src/main/java/feast/serving/config/RegistryConfig.java rename to java/serving/src/main/java/feast/serving/service/config/RegistryConfigModule.java index 49e08e03b6..cfb4666f07 100644 --- a/java/serving/src/main/java/feast/serving/config/RegistryConfig.java +++ b/java/serving/src/main/java/feast/serving/service/config/RegistryConfigModule.java @@ -14,7 +14,7 @@ * See the License for the specific language 
governing permissions and * limitations under the License. */ -package feast.serving.config; +package feast.serving.service.config; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; @@ -27,7 +27,7 @@ import java.net.URI; import java.util.Optional; -public class RegistryConfig extends AbstractModule { +public class RegistryConfigModule extends AbstractModule { @Provides Storage googleStorage(ApplicationProperties applicationProperties) { return StorageOptions.newBuilder() diff --git a/java/serving/src/main/java/feast/serving/config/ServerModule.java b/java/serving/src/main/java/feast/serving/service/config/ServerModule.java similarity index 92% rename from java/serving/src/main/java/feast/serving/config/ServerModule.java rename to java/serving/src/main/java/feast/serving/service/config/ServerModule.java index 5428306f2b..a5d902b17b 100644 --- a/java/serving/src/main/java/feast/serving/config/ServerModule.java +++ b/java/serving/src/main/java/feast/serving/service/config/ServerModule.java @@ -14,13 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.serving.config; +package feast.serving.service.config; import com.google.inject.AbstractModule; import com.google.inject.Provides; -import feast.serving.controller.HealthServiceController; -import feast.serving.grpc.OnlineServingGrpcServiceV2; import feast.serving.service.ServingServiceV2; +import feast.serving.service.controller.HealthServiceController; +import feast.serving.service.grpc.OnlineServingGrpcServiceV2; import io.grpc.Server; import io.grpc.ServerBuilder; import io.grpc.health.v1.HealthGrpc; diff --git a/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java b/java/serving/src/main/java/feast/serving/service/config/ServingServiceV2Module.java similarity index 83% rename from java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java rename to java/serving/src/main/java/feast/serving/service/config/ServingServiceV2Module.java index 868e3b83d1..564159ceed 100644 --- a/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java +++ b/java/serving/src/main/java/feast/serving/service/config/ServingServiceV2Module.java @@ -14,21 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.serving.config; +package feast.serving.service.config; import com.google.inject.AbstractModule; import com.google.inject.Provides; -import feast.serving.registry.*; +import feast.serving.connectors.OnlineRetriever; +import feast.serving.connectors.redis.retriever.*; +import feast.serving.registry.RegistryRepository; import feast.serving.service.OnlineServingServiceV2; import feast.serving.service.OnlineTransformationService; import feast.serving.service.ServingServiceV2; -import feast.storage.api.retriever.OnlineRetrieverV2; -import feast.storage.connectors.redis.retriever.*; import io.opentracing.Tracer; +import java.util.Optional; import org.slf4j.Logger; -public class ServingServiceConfigV2 extends AbstractModule { - private static final Logger log = org.slf4j.LoggerFactory.getLogger(ServingServiceConfigV2.class); +public class ServingServiceV2Module extends AbstractModule { + private static final Logger log = org.slf4j.LoggerFactory.getLogger(ServingServiceV2Module.class); @Provides public ServingServiceV2 registryBasedServingServiceV2( @@ -38,14 +39,14 @@ public ServingServiceV2 registryBasedServingServiceV2( final ServingServiceV2 servingService; final ApplicationProperties.Store store = applicationProperties.getFeast().getActiveStore(); - OnlineRetrieverV2 retrieverV2; + OnlineRetriever retriever; // TODO: Support more store types, and potentially use a plugin model here. 
switch (store.getType()) { case REDIS_CLUSTER: RedisClientAdapter redisClusterClient = RedisClusterClient.create(store.getRedisClusterConfig()); - retrieverV2 = - new OnlineRetriever( + retriever = + new RedisOnlineRetriever( applicationProperties.getFeast().getProject(), redisClusterClient, new EntityKeySerializerV2( @@ -54,8 +55,8 @@ public ServingServiceV2 registryBasedServingServiceV2( case REDIS: RedisClientAdapter redisClient = RedisClient.create(store.getRedisConfig()); log.info("Created EntityKeySerializerV2"); - retrieverV2 = - new OnlineRetriever( + retriever = + new RedisOnlineRetriever( applicationProperties.getFeast().getProject(), redisClient, new EntityKeySerializerV2( @@ -77,11 +78,11 @@ public ServingServiceV2 registryBasedServingServiceV2( servingService = new OnlineServingServiceV2( - retrieverV2, - tracer, + retriever, registryRepository, onlineTransformationService, - applicationProperties.getFeast().getProject()); + applicationProperties.getFeast().getProject(), + Optional.of(tracer)); return servingService; } diff --git a/java/serving/src/main/java/feast/serving/controller/HealthServiceController.java b/java/serving/src/main/java/feast/serving/service/controller/HealthServiceController.java similarity index 98% rename from java/serving/src/main/java/feast/serving/controller/HealthServiceController.java rename to java/serving/src/main/java/feast/serving/service/controller/HealthServiceController.java index 2f98ae032f..2ce17a5751 100644 --- a/java/serving/src/main/java/feast/serving/controller/HealthServiceController.java +++ b/java/serving/src/main/java/feast/serving/service/controller/HealthServiceController.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.serving.controller; +package feast.serving.service.controller; import com.google.inject.Inject; import feast.proto.serving.ServingAPIProto.GetFeastServingInfoRequest; diff --git a/java/serving/src/main/java/feast/serving/grpc/OnlineServingGrpcServiceV2.java b/java/serving/src/main/java/feast/serving/service/grpc/OnlineServingGrpcServiceV2.java similarity index 98% rename from java/serving/src/main/java/feast/serving/grpc/OnlineServingGrpcServiceV2.java rename to java/serving/src/main/java/feast/serving/service/grpc/OnlineServingGrpcServiceV2.java index fe024404f3..bc155a7fa2 100644 --- a/java/serving/src/main/java/feast/serving/grpc/OnlineServingGrpcServiceV2.java +++ b/java/serving/src/main/java/feast/serving/service/grpc/OnlineServingGrpcServiceV2.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.serving.grpc; +package feast.serving.service.grpc; import feast.proto.serving.ServingAPIProto; import feast.proto.serving.ServingServiceGrpc; diff --git a/java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringContext.java b/java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringContext.java similarity index 96% rename from java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringContext.java rename to java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringContext.java index 48d8d76a91..c1803e99fc 100644 --- a/java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringContext.java +++ b/java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringContext.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.serving.interceptors; +package feast.serving.service.interceptors; import java.util.Optional; diff --git a/java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringInterceptor.java b/java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringInterceptor.java similarity index 98% rename from java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringInterceptor.java rename to java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringInterceptor.java index 735f8c556d..8777e8f844 100644 --- a/java/serving/src/main/java/feast/serving/interceptors/GrpcMonitoringInterceptor.java +++ b/java/serving/src/main/java/feast/serving/service/interceptors/GrpcMonitoringInterceptor.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.serving.interceptors; +package feast.serving.service.interceptors; import feast.serving.util.Metrics; import io.grpc.ForwardingServerCall.SimpleForwardingServerCall; diff --git a/java/serving/src/main/java/feast/serving/util/Metrics.java b/java/serving/src/main/java/feast/serving/util/Metrics.java index dca2b5e82a..669f79ddff 100644 --- a/java/serving/src/main/java/feast/serving/util/Metrics.java +++ b/java/serving/src/main/java/feast/serving/util/Metrics.java @@ -19,6 +19,7 @@ import io.prometheus.client.Counter; import io.prometheus.client.Histogram; +// TODO: send these metrics either via Prometheus push gateway or StatsD public class Metrics { public static final Histogram requestLatency = diff --git a/java/serving/src/main/java/feast/serving/util/RequestHelper.java b/java/serving/src/main/java/feast/serving/util/RequestHelper.java index f730e01982..0f66e806ef 100644 --- a/java/serving/src/main/java/feast/serving/util/RequestHelper.java +++ b/java/serving/src/main/java/feast/serving/util/RequestHelper.java @@ -16,9 +16,9 @@ */ package feast.serving.util; -import 
feast.common.models.Feature; import feast.proto.serving.ServingAPIProto; import feast.proto.serving.ServingAPIProto.FeatureReferenceV2; +import feast.serving.service.FeatureUtil; public class RequestHelper { @@ -29,7 +29,7 @@ public static void validateOnlineRequest(ServingAPIProto.GetOnlineFeaturesReques } // All FeatureReferences should have FeatureTable name and Feature name for (String featureReference : request.getFeatures().getValList()) { - validateOnlineRequestFeatureReference(Feature.parseFeatureReference(featureReference)); + validateOnlineRequestFeatureReference(FeatureUtil.parseFeatureReference(featureReference)); } } diff --git a/java/serving/src/main/java/feast/serving/util/mappers/ResponseJSONMapper.java b/java/serving/src/main/java/feast/serving/util/mappers/ResponseJSONMapper.java deleted file mode 100644 index 3ab9f43c34..0000000000 --- a/java/serving/src/main/java/feast/serving/util/mappers/ResponseJSONMapper.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package feast.serving.util.mappers; - -import feast.proto.serving.ServingAPIProto; -import feast.proto.types.ValueProto.Value; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -// ResponseJSONMapper maps GRPC Response types to more human readable JSON responses -public class ResponseJSONMapper { - - public static List> mapGetOnlineFeaturesResponse( - ServingAPIProto.GetOnlineFeaturesResponse response) { - return response.getResultsList().stream() - .map(fieldValues -> convertFieldValuesToMap(fieldValues)) - .collect(Collectors.toList()); - } - - private static Map convertFieldValuesToMap( - ServingAPIProto.GetOnlineFeaturesResponse.FeatureVector vec) { - return Map.of( - "values", - vec.getValuesList().stream() - .map(ResponseJSONMapper::extractValue) - .collect(Collectors.toList()), - "statuses", - vec.getStatusesList(), - "event_timestamp", - vec.getEventTimestampsList()); - } - - private static Object extractValue(Value value) { - switch (value.getValCase().getNumber()) { - case 1: - return value.getBytesVal(); - case 2: - return value.getStringVal(); - case 3: - return value.getInt32Val(); - case 4: - return value.getInt64Val(); - case 5: - return value.getDoubleVal(); - case 6: - return value.getFloatVal(); - case 7: - return value.getBoolVal(); - case 11: - return value.getBytesListVal(); - case 12: - return value.getStringListVal(); - case 13: - return value.getInt32ListVal(); - case 14: - return value.getInt64ListVal(); - case 15: - return value.getDoubleListVal(); - case 16: - return value.getFloatListVal(); - case 17: - return value.getBoolListVal(); - default: - return null; - } - } -} diff --git a/java/serving/src/main/resources/application.yml b/java/serving/src/main/resources/application.yml index 1f6d5b34c4..62ace5018e 100644 --- a/java/serving/src/main/resources/application.yml +++ b/java/serving/src/main/resources/application.yml @@ -34,32 +34,9 @@ feast: # The service name identifier for the tracing data 
serviceName: feast_serving - logging: - # Audit logging provides a machine readable structured JSON log that can give better - # insight into what is happening in Feast. - audit: - # Whether audit logging is enabled. - enabled: true - # Whether to enable message level (ie request/response) audit logging - messageLogging: - enabled: false - # Logging forwarder currently provides a machine readable structured JSON log to an - # external fluentd service that can give better insight into what is happening in Feast. - # Accepts console / fluentd as destination - destination: console - fluentdHost: localhost - fluentdPort: 24224 - grpc: server: # The port number Feast Serving GRPC service should listen on # It is set default to 6566 so it does not conflict with the GRPC server on Feast Core # which defaults to port 6565 port: 6566 - -rest: - server: - # The port number on which the Tomcat webserver that serves REST API endpoints should listen - # It is set by default to 8081 so it does not conflict with Tomcat webserver on Feast Core - # if both Feast Core and Serving are running on the same machine - port: 8081 diff --git a/java/serving/src/test/java/feast/serving/it/ServingBenchmarkIT.java b/java/serving/src/test/java/feast/serving/it/ServingBenchmarkIT.java index 1d77c2e4f7..4254a80e17 100644 --- a/java/serving/src/test/java/feast/serving/it/ServingBenchmarkIT.java +++ b/java/serving/src/test/java/feast/serving/it/ServingBenchmarkIT.java @@ -22,7 +22,7 @@ import com.google.common.math.Quantiles; import feast.proto.serving.ServingAPIProto; import feast.proto.types.ValueProto; -import feast.serving.config.ApplicationProperties; +import feast.serving.service.config.ApplicationProperties; import feast.serving.util.DataGenerator; import java.util.List; import java.util.LongSummaryStatistics; diff --git a/java/serving/src/test/java/feast/serving/it/ServingEnvironment.java b/java/serving/src/test/java/feast/serving/it/ServingEnvironment.java index ebbfa89d37..43b82345c6 
100644 --- a/java/serving/src/test/java/feast/serving/it/ServingEnvironment.java +++ b/java/serving/src/test/java/feast/serving/it/ServingEnvironment.java @@ -24,8 +24,8 @@ import com.google.inject.Module; import com.google.inject.util.Modules; import feast.proto.serving.ServingServiceGrpc; -import feast.serving.config.*; -import feast.serving.grpc.OnlineServingGrpcServiceV2; +import feast.serving.service.config.*; +import feast.serving.service.grpc.OnlineServingGrpcServiceV2; import io.grpc.ManagedChannel; import io.grpc.ManagedChannelBuilder; import io.grpc.Server; @@ -120,16 +120,16 @@ ApplicationProperties applicationProperties() { Module overrideConfig = registryConfig(); Module registryConfig; if (overrideConfig != null) { - registryConfig = Modules.override(new RegistryConfig()).with(registryConfig()); + registryConfig = Modules.override(new RegistryConfigModule()).with(registryConfig()); } else { - registryConfig = new RegistryConfig(); + registryConfig = new RegistryConfigModule(); } injector = Guice.createInjector( - new ServingServiceConfigV2(), + new ServingServiceV2Module(), registryConfig, - new InstrumentationConfig(), + new InstrumentationConfigModule(), appPropertiesModule, new ServerModule()); diff --git a/java/serving/src/test/java/feast/serving/it/ServingRedisGSRegistryIT.java b/java/serving/src/test/java/feast/serving/it/ServingRedisGSRegistryIT.java index 78871cd45c..925f1887d2 100644 --- a/java/serving/src/test/java/feast/serving/it/ServingRedisGSRegistryIT.java +++ b/java/serving/src/test/java/feast/serving/it/ServingRedisGSRegistryIT.java @@ -16,12 +16,12 @@ */ package feast.serving.it; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import com.google.cloud.storage.*; import com.google.cloud.storage.testing.RemoteStorageHelper; import feast.proto.core.RegistryProto; -import feast.serving.config.ApplicationProperties; +import 
feast.serving.service.config.ApplicationProperties; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.AfterAll; diff --git a/java/serving/src/test/java/feast/serving/it/ServingRedisLocalRegistryIT.java b/java/serving/src/test/java/feast/serving/it/ServingRedisLocalRegistryIT.java index c83d8dbbf1..91fc0ed5e1 100644 --- a/java/serving/src/test/java/feast/serving/it/ServingRedisLocalRegistryIT.java +++ b/java/serving/src/test/java/feast/serving/it/ServingRedisLocalRegistryIT.java @@ -17,7 +17,7 @@ package feast.serving.it; import feast.proto.core.RegistryProto; -import feast.serving.config.ApplicationProperties; +import feast.serving.service.config.ApplicationProperties; public class ServingRedisLocalRegistryIT extends ServingBaseTests { @Override diff --git a/java/serving/src/test/java/feast/serving/it/ServingRedisS3RegistryIT.java b/java/serving/src/test/java/feast/serving/it/ServingRedisS3RegistryIT.java index d67fbf2621..12315c9e48 100644 --- a/java/serving/src/test/java/feast/serving/it/ServingRedisS3RegistryIT.java +++ b/java/serving/src/test/java/feast/serving/it/ServingRedisS3RegistryIT.java @@ -24,7 +24,7 @@ import com.google.inject.AbstractModule; import com.google.inject.Provides; import feast.proto.core.RegistryProto; -import feast.serving.config.ApplicationProperties; +import feast.serving.service.config.ApplicationProperties; import java.io.ByteArrayInputStream; import org.junit.jupiter.api.BeforeAll; import org.testcontainers.junit.jupiter.Container; diff --git a/java/serving/src/test/java/feast/serving/it/TestUtils.java b/java/serving/src/test/java/feast/serving/it/TestUtils.java index 9bca14db4e..180b26fd02 100644 --- a/java/serving/src/test/java/feast/serving/it/TestUtils.java +++ b/java/serving/src/test/java/feast/serving/it/TestUtils.java @@ -22,7 +22,7 @@ import feast.proto.serving.ServingAPIProto.GetOnlineFeaturesRequest; import feast.proto.serving.ServingServiceGrpc; import 
feast.proto.types.ValueProto; -import feast.serving.config.ApplicationProperties; +import feast.serving.service.config.ApplicationProperties; import io.grpc.Channel; import io.grpc.ManagedChannelBuilder; import java.util.*; @@ -77,7 +77,7 @@ public static ApplicationProperties.FeastProperties createBasicFeastProperties( feastProperties.setActiveStore("online"); feastProperties.setProject("feast_project"); - + feastProperties.setEntityKeySerializationVersion(2); feastProperties.setStores( ImmutableList.of( new ApplicationProperties.Store( diff --git a/java/serving/src/test/java/feast/serving/it/TransformationServiceIT.java b/java/serving/src/test/java/feast/serving/it/TransformationServiceIT.java index 102d851528..81147d4268 100644 --- a/java/serving/src/test/java/feast/serving/it/TransformationServiceIT.java +++ b/java/serving/src/test/java/feast/serving/it/TransformationServiceIT.java @@ -22,7 +22,7 @@ import com.google.common.collect.Lists; import feast.proto.serving.ServingAPIProto; import feast.proto.types.ValueProto; -import feast.serving.config.ApplicationProperties; +import feast.serving.service.config.ApplicationProperties; import feast.serving.util.DataGenerator; import java.util.List; import java.util.Map; diff --git a/java/common/src/test/java/feast/common/models/FeaturesTest.java b/java/serving/src/test/java/feast/serving/service/FeaturesTest.java similarity index 91% rename from java/common/src/test/java/feast/common/models/FeaturesTest.java rename to java/serving/src/test/java/feast/serving/service/FeaturesTest.java index 953da61afe..27c0b21d2c 100644 --- a/java/common/src/test/java/feast/common/models/FeaturesTest.java +++ b/java/serving/src/test/java/feast/serving/service/FeaturesTest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package feast.common.models; +package feast.serving.service; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.core.IsEqual.equalTo; @@ -38,7 +38,7 @@ public void setUp() { @Test public void shouldReturnFeatureStringRef() { - String actualFeatureStringRef = Feature.getFeatureReference(featureReference); + String actualFeatureStringRef = FeatureUtil.getFeatureReference(featureReference); String expectedFeatureStringRef = "featuretable_1:feature1"; assertThat(actualFeatureStringRef, equalTo(expectedFeatureStringRef)); diff --git a/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java b/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java index 933e38f056..32bf2a4cb8 100644 --- a/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java +++ b/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java @@ -32,16 +32,17 @@ import feast.proto.serving.ServingAPIProto.FieldStatus; import feast.proto.serving.ServingAPIProto.GetOnlineFeaturesResponse; import feast.proto.types.ValueProto; +import feast.serving.connectors.Feature; +import feast.serving.connectors.ProtoFeature; +import feast.serving.connectors.redis.retriever.RedisOnlineRetriever; import feast.serving.registry.Registry; import feast.serving.registry.RegistryRepository; -import feast.storage.api.retriever.Feature; -import feast.storage.api.retriever.ProtoFeature; -import feast.storage.connectors.redis.retriever.OnlineRetriever; import io.opentracing.Tracer; import io.opentracing.Tracer.SpanBuilder; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; import org.junit.Before; import org.junit.Test; @@ -53,7 +54,7 @@ public class OnlineServingServiceTest { @Mock Registry registry; @Mock Tracer tracer; - @Mock OnlineRetriever retrieverV2; + @Mock RedisOnlineRetriever retrieverV2; private String 
transformationServiceEndpoint; private OnlineServingServiceV2 onlineServingServiceV2; @@ -73,7 +74,11 @@ public void setUp() { new OnlineTransformationService(transformationServiceEndpoint, registryRepo); onlineServingServiceV2 = new OnlineServingServiceV2( - retrieverV2, tracer, registryRepo, onlineTransformationService, "feast_project"); + retrieverV2, + registryRepo, + onlineTransformationService, + "feast_project", + Optional.of(tracer)); mockedFeatureRows = new ArrayList<>(); mockedFeatureRows.add( @@ -378,7 +383,7 @@ private ServingAPIProto.GetOnlineFeaturesRequest getOnlineFeaturesRequest( ServingAPIProto.FeatureList.newBuilder() .addAllVal( featureReferences.stream() - .map(feast.common.models.Feature::getFeatureReference) + .map(FeatureUtil::getFeatureReference) .collect(Collectors.toList())) .build()) .putAllEntities( diff --git a/java/serving/src/test/resources/docker-compose/feast10/definitions.py b/java/serving/src/test/resources/docker-compose/feast10/definitions.py index 806995ec06..769ac15545 100644 --- a/java/serving/src/test/resources/docker-compose/feast10/definitions.py +++ b/java/serving/src/test/resources/docker-compose/feast10/definitions.py @@ -1,4 +1,7 @@ +from datetime import timedelta + import pandas as pd + from feast.data_source import RequestSource from feast.entity import Entity from feast.feature_service import FeatureService @@ -7,7 +10,6 @@ from feast.on_demand_feature_view import on_demand_feature_view from feast.types import Float32, Float64, Int64 from feast.value_type import ValueType -from google.protobuf.duration_pb2 import Duration from feast import FileSource file_path = "driver_stats.parquet" @@ -17,7 +19,7 @@ created_timestamp_column="created", ) -# Define an entity for the driver. You can think of entity as a primary key used to +# Define an entity for the driver. You can think of an entity as a primary key used to # fetch features. 
driver = Entity(name="driver_id", description="driver id") @@ -27,7 +29,7 @@ driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", entities=[driver], - ttl=Duration(seconds=86400 * 7), + ttl=timedelta(seconds=86400 * 7), schema=[ Field(name="conv_rate", dtype=Float64), Field(name="acc_rate", dtype=Float32), @@ -71,19 +73,19 @@ def transformed_conv_rate(features_df: pd.DataFrame) -> pd.DataFrame: path="benchmark_data.parquet", timestamp_field="event_timestamp", ) -entity = Entity(name="entity", value_type=ValueType.STRING,) +entity = Entity(name="entity") -benchmark_feature_views = [ - FeatureView( +benchmark_feature_views = [] +for i in range(25): + fv = FeatureView( name=f"feature_view_{i}", entities=[entity], - ttl=Duration(seconds=86400), + ttl=timedelta(seconds=86400), schema=[Field(name=f"feature_{10 * i + j}", dtype=Int64) for j in range(10)], online=True, source=generated_data_source, ) - for i in range(25) -] + benchmark_feature_views.append(fv) benchmark_feature_service = FeatureService( name=f"benchmark_feature_service", features=benchmark_feature_views, diff --git a/java/serving/src/test/resources/docker-compose/feast10/feature_store.yaml b/java/serving/src/test/resources/docker-compose/feast10/feature_store.yaml index 2e6625c025..56e469bbee 100644 --- a/java/serving/src/test/resources/docker-compose/feast10/feature_store.yaml +++ b/java/serving/src/test/resources/docker-compose/feast10/feature_store.yaml @@ -5,6 +5,4 @@ online_store: type: redis connection_string: "redis:6379,password=testpw" offline_store: {} -flags: - alpha_features: true - on_demand_transforms: true +entity_key_serialization_version: 2 diff --git a/java/serving/src/test/resources/docker-compose/feast10/registry.db b/java/serving/src/test/resources/docker-compose/feast10/registry.db deleted file mode 100644 index 746934e3d0..0000000000 Binary files a/java/serving/src/test/resources/docker-compose/feast10/registry.db and /dev/null differ diff --git 
a/java/serving/src/test/resources/docker-compose/feast10/setup_it.py b/java/serving/src/test/resources/docker-compose/feast10/setup_it.py new file mode 100644 index 0000000000..61aaa6fec8 --- /dev/null +++ b/java/serving/src/test/resources/docker-compose/feast10/setup_it.py @@ -0,0 +1,86 @@ +from pathlib import Path +from feast.repo_config import load_repo_config +from datetime import datetime, timedelta + +import numpy as np +import pandas as pd + +from definitions import ( + benchmark_feature_service, + benchmark_feature_views, + driver, + driver_hourly_stats_view, + entity, + transformed_conv_rate, +) + +from feast import FeatureStore + + +def setup_data(): + start = datetime.now() - timedelta(days=10) + + df = pd.DataFrame() + df["driver_id"] = np.arange(1000, 1010) + df["created"] = datetime.now() + df["conv_rate"] = np.arange(0, 1, 0.1) + df["acc_rate"] = np.arange(0.5, 1, 0.05) + df["avg_daily_trips"] = np.arange(0, 1000, 100) + + # some of rows are beyond 7 days to test OUTSIDE_MAX_AGE status + df["event_timestamp"] = start + pd.Series(np.arange(0, 10)).map( + lambda days: timedelta(days=days) + ) + + # Store data in parquet files. Parquet is convenient for local development mode. For + # production, you can use your favorite DWH, such as BigQuery. See Feast documentation + # for more info. 
+ df.to_parquet("driver_stats.parquet") + + # For Benchmarks + # Please read more in Feast RFC-031 + # (link https://docs.google.com/document/d/12UuvTQnTTCJhdRgy6h10zSbInNGSyEJkIxpOcgOen1I/edit) + # about this benchmark setup + def generate_data( + num_rows, num_features, destination + ): + features = [f"feature_{i}" for i in range(num_features)] + columns = ["entity", "event_timestamp"] + features + df = pd.DataFrame(0, index=np.arange(num_rows), columns=columns) + df["event_timestamp"] = datetime.utcnow() + for column in features: + df[column] = np.random.randint(1, num_rows, num_rows) + + df["entity"] = "key-" + pd.Series(np.arange(1, num_rows + 1)).astype( + pd.StringDtype() + ) + + df.to_parquet(destination) + + generate_data(10**3, 250, "benchmark_data.parquet") + + +def main(): + print("Running setup_it.py") + + setup_data() + existing_repo_config = load_repo_config(Path("."), Path(".") / "feature_store.yaml") + + # Update to default online store since otherwise, relies on Dockerized Redis service + fs = FeatureStore(config=existing_repo_config.copy(update={"online_store": {}})) + fs.apply( + [ + driver_hourly_stats_view, + transformed_conv_rate, + driver, + entity, + benchmark_feature_service, + *benchmark_feature_views, + ] + ) + + print("setup_it finished") + + +if __name__ == "__main__": + main() diff --git a/java/storage/api/pom.xml b/java/storage/api/pom.xml deleted file mode 100644 index 90f656e281..0000000000 --- a/java/storage/api/pom.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - dev.feast - feast-parent - ${revision} - ../../pom.xml - - - 4.0.0 - feast-storage-api - - Feast Storage API - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - javax.annotation - - - - - - - - - - dev.feast - feast-datatypes - ${project.version} - - - - com.google.auto.value - auto-value-annotations - 1.6.6 - - - - com.google.auto.value - auto-value - 1.6.6 - provided - - - - org.apache.commons - commons-lang3 - 3.9 - - - - org.apache.avro - avro - 1.10.2 - - 
- - junit - junit - 4.13.2 - test - - - - diff --git a/java/storage/api/src/main/java/feast/storage/api/retriever/AvroFeature.java b/java/storage/api/src/main/java/feast/storage/api/retriever/AvroFeature.java deleted file mode 100644 index d8f2763010..0000000000 --- a/java/storage/api/src/main/java/feast/storage/api/retriever/AvroFeature.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2021 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.storage.api.retriever; - -import com.google.protobuf.ByteString; -import com.google.protobuf.Timestamp; -import feast.proto.serving.ServingAPIProto; -import feast.proto.types.ValueProto; -import java.nio.ByteBuffer; -import java.util.stream.Collectors; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; - -public class AvroFeature implements Feature { - private final ServingAPIProto.FeatureReferenceV2 featureReference; - - private final Timestamp eventTimestamp; - - private final Object featureValue; - - public AvroFeature( - ServingAPIProto.FeatureReferenceV2 featureReference, - Timestamp eventTimestamp, - Object featureValue) { - this.featureReference = featureReference; - this.eventTimestamp = eventTimestamp; - this.featureValue = featureValue; - } - - /** - * Casts feature value of Object type based on Feast valueType. Empty object i.e new Object() is - * interpreted as VAL_NOT_SET Feast valueType. 
- * - * @param valueType Feast valueType of feature as specified in FeatureSpec - * @return ValueProto.Value representation of feature - */ - @Override - public ValueProto.Value getFeatureValue(ValueProto.ValueType.Enum valueType) { - ValueProto.Value finalValue; - - try { - switch (valueType) { - case STRING: - finalValue = - ValueProto.Value.newBuilder().setStringVal(((Utf8) featureValue).toString()).build(); - break; - case INT32: - finalValue = ValueProto.Value.newBuilder().setInt32Val((Integer) featureValue).build(); - break; - case INT64: - finalValue = ValueProto.Value.newBuilder().setInt64Val((Long) featureValue).build(); - break; - case DOUBLE: - finalValue = ValueProto.Value.newBuilder().setDoubleVal((Double) featureValue).build(); - break; - case FLOAT: - finalValue = ValueProto.Value.newBuilder().setFloatVal((Float) featureValue).build(); - break; - case BYTES: - finalValue = - ValueProto.Value.newBuilder() - .setBytesVal(ByteString.copyFrom(((ByteBuffer) featureValue).array())) - .build(); - break; - case BOOL: - finalValue = ValueProto.Value.newBuilder().setBoolVal((Boolean) featureValue).build(); - break; - case STRING_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setStringListVal( - ValueProto.StringList.newBuilder() - .addAllVal( - ((GenericData.Array) featureValue) - .stream().map(Utf8::toString).collect(Collectors.toList())) - .build()) - .build(); - break; - case INT64_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setInt64ListVal( - ValueProto.Int64List.newBuilder() - .addAllVal(((GenericData.Array) featureValue)) - .build()) - .build(); - break; - case INT32_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setInt32ListVal( - ValueProto.Int32List.newBuilder() - .addAllVal(((GenericData.Array) featureValue)) - .build()) - .build(); - break; - case FLOAT_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setFloatListVal( - ValueProto.FloatList.newBuilder() - .addAllVal(((GenericData.Array) featureValue)) - .build()) 
- .build(); - break; - case DOUBLE_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setDoubleListVal( - ValueProto.DoubleList.newBuilder() - .addAllVal(((GenericData.Array) featureValue)) - .build()) - .build(); - break; - case BOOL_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setBoolListVal( - ValueProto.BoolList.newBuilder() - .addAllVal(((GenericData.Array) featureValue)) - .build()) - .build(); - break; - case BYTES_LIST: - finalValue = - ValueProto.Value.newBuilder() - .setBytesListVal( - ValueProto.BytesList.newBuilder() - .addAllVal( - ((GenericData.Array) featureValue) - .stream() - .map(byteBuffer -> ByteString.copyFrom(byteBuffer.array())) - .collect(Collectors.toList())) - .build()) - .build(); - break; - default: - throw new RuntimeException( - String.format("FeatureType %s is not supported", valueType.name())); - } - } catch (ClassCastException e) { - // Feature type has changed - finalValue = ValueProto.Value.newBuilder().build(); - } - - return finalValue; - } - - @Override - public ServingAPIProto.FeatureReferenceV2 getFeatureReference() { - return this.featureReference; - } - - @Override - public Timestamp getEventTimestamp() { - return this.eventTimestamp; - } -} diff --git a/java/storage/api/src/main/java/feast/storage/api/retriever/FeatureTableRequest.java b/java/storage/api/src/main/java/feast/storage/api/retriever/FeatureTableRequest.java deleted file mode 100644 index 2f181e6de8..0000000000 --- a/java/storage/api/src/main/java/feast/storage/api/retriever/FeatureTableRequest.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.storage.api.retriever; - -import com.google.auto.value.AutoValue; -import com.google.common.collect.ImmutableSet; -import feast.proto.core.FeatureTableProto.FeatureTableSpec; -import feast.proto.serving.ServingAPIProto.FeatureReferenceV2; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -@AutoValue -public abstract class FeatureTableRequest { - public abstract FeatureTableSpec getSpec(); - - public abstract ImmutableSet getFeatureReferences(); - - public static Builder newBuilder() { - return new AutoValue_FeatureTableRequest.Builder(); - } - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setSpec(FeatureTableSpec spec); - - abstract ImmutableSet.Builder featureReferencesBuilder(); - - public Builder addAllFeatureReferences(List featureReferenceList) { - featureReferencesBuilder().addAll(featureReferenceList); - return this; - } - - public Builder addFeatureReference(FeatureReferenceV2 featureReference) { - featureReferencesBuilder().add(featureReference); - return this; - } - - public abstract FeatureTableRequest build(); - } - - public Map getFeatureRefsByName() { - return getFeatureReferences().stream() - .collect( - Collectors.toMap( - FeatureReferenceV2::getFeatureName, featureReference -> featureReference)); - } -} diff --git a/java/storage/connectors/pom.xml b/java/storage/connectors/pom.xml deleted file mode 100644 index 11e32a154c..0000000000 --- a/java/storage/connectors/pom.xml +++ /dev/null @@ -1,61 +0,0 @@ - - - - dev.feast - feast-parent 
- ${revision} - ../../pom.xml - - - 4.0.0 - feast-storage-connectors - pom - - Feast Storage Connectors - - - redis - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - javax.annotation - - - - - - org.jacoco - jacoco-maven-plugin - - - - - - - dev.feast - feast-datatypes - ${project.version} - - - - dev.feast - feast-common - ${project.version} - - - - dev.feast - feast-storage-api - ${project.version} - - - - diff --git a/java/storage/connectors/redis/pom.xml b/java/storage/connectors/redis/pom.xml deleted file mode 100644 index ce25f41da6..0000000000 --- a/java/storage/connectors/redis/pom.xml +++ /dev/null @@ -1,104 +0,0 @@ - - - - dev.feast - feast-storage-connectors - ${revision} - - - 4.0.0 - feast-storage-connector-redis - - Feast Storage Connector for Redis - - - - io.lettuce - lettuce-core - 6.0.2.RELEASE - - - - io.netty - netty-transport-native-epoll - 4.1.52.Final - linux-x86_64 - - - - org.apache.commons - commons-lang3 - 3.9 - - - - com.google.auto.value - auto-value-annotations - 1.6.6 - - - - com.google.auto.value - auto-value - 1.6.6 - provided - - - - com.google.guava - guava - ${guava.version} - - - - org.mockito - mockito-core - ${mockito.version} - test - - - - - com.github.kstyrc - embedded-redis - 0.6 - test - - - - org.hamcrest - hamcrest-core - test - ${hamcrest.version} - - - - org.hamcrest - hamcrest-library - test - ${hamcrest.version} - - - - net.ishiis.redis - redis-unit - 1.0.3 - test - - - - - junit - junit - 4.13.2 - test - - - org.slf4j - slf4j-simple - 1.7.32 - test - - - diff --git a/java/storage/connectors/redis/src/test/java/feast/storage/connectors/redis/test/TestUtil.java b/java/storage/connectors/redis/src/test/java/feast/storage/connectors/redis/test/TestUtil.java deleted file mode 100644 index 66aba44bc2..0000000000 --- a/java/storage/connectors/redis/src/test/java/feast/storage/connectors/redis/test/TestUtil.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * Copyright 
2018-2020 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package feast.storage.connectors.redis.test; - -import java.io.IOException; -import redis.embedded.RedisServer; - -public class TestUtil { - public static class LocalRedis { - - private static RedisServer server; - - /** - * Start local Redis for used in testing at "localhost" - * - * @param port port number - * @throws IOException if Redis failed to start - */ - public static void start(int port) throws IOException { - server = new RedisServer(port); - server.start(); - } - - public static void stop() { - if (server != null) { - server.stop(); - } - } - } -} diff --git a/protos/feast/core/DataSource.proto b/protos/feast/core/DataSource.proto index 62f5859ee8..5258618f3b 100644 --- a/protos/feast/core/DataSource.proto +++ b/protos/feast/core/DataSource.proto @@ -49,6 +49,7 @@ message DataSource { PUSH_SOURCE = 9; BATCH_TRINO = 10; BATCH_SPARK = 11; + BATCH_ATHENA = 12; } // Unique name of data source within the project @@ -171,6 +172,22 @@ message DataSource { string database = 4; } + // Defines options for DataSource that sources features from a Athena Query + message AthenaOptions { + // Athena table name + string table = 1; + + // SQL query that returns a table containing feature data. 
Must contain an event_timestamp column, and respective + // entity columns + string query = 2; + + // Athena database name + string database = 3; + + // Athena schema name + string data_source = 4; + } + // Defines options for DataSource that sources features from a Snowflake Query message SnowflakeOptions { // Snowflake table name @@ -242,5 +259,6 @@ message DataSource { PushOptions push_options = 22; SparkOptions spark_options = 27; TrinoOptions trino_options = 30; + AthenaOptions athena_options = 35; } } diff --git a/protos/feast/core/FeatureService.proto b/protos/feast/core/FeatureService.proto index 51b9c6c02a..80d32eb4de 100644 --- a/protos/feast/core/FeatureService.proto +++ b/protos/feast/core/FeatureService.proto @@ -60,6 +60,7 @@ message LoggingConfig { RedshiftDestination redshift_destination = 5; SnowflakeDestination snowflake_destination = 6; CustomDestination custom_destination = 7; + AthenaDestination athena_destination = 8; } message FileDestination { @@ -80,6 +81,11 @@ message LoggingConfig { string table_name = 1; } + message AthenaDestination { + // Destination table name. data_source and database will be taken from an offline store config + string table_name = 1; + } + message SnowflakeDestination { // Destination table name. 
Schema and database will be taken from an offline store config string table_name = 1; diff --git a/protos/feast/core/OnDemandFeatureView.proto b/protos/feast/core/OnDemandFeatureView.proto index 33c51f5c4d..50bf8b6f55 100644 --- a/protos/feast/core/OnDemandFeatureView.proto +++ b/protos/feast/core/OnDemandFeatureView.proto @@ -83,4 +83,7 @@ message UserDefinedFunction { // The python-syntax function body (serialized by dill) bytes body = 2; + + // The string representation of the udf + string body_text = 3; } diff --git a/protos/feast/core/SavedDataset.proto b/protos/feast/core/SavedDataset.proto index 53f06f73a9..111548aa48 100644 --- a/protos/feast/core/SavedDataset.proto +++ b/protos/feast/core/SavedDataset.proto @@ -59,6 +59,7 @@ message SavedDatasetStorage { DataSource.TrinoOptions trino_storage = 8; DataSource.SparkOptions spark_storage = 9; DataSource.CustomSourceOptions custom_storage = 10; + DataSource.AthenaOptions athena_storage = 11; } } diff --git a/protos/feast/core/StreamFeatureView.proto b/protos/feast/core/StreamFeatureView.proto index 06e9ee0612..3181bdf360 100644 --- a/protos/feast/core/StreamFeatureView.proto +++ b/protos/feast/core/StreamFeatureView.proto @@ -24,7 +24,6 @@ option java_package = "feast.proto.core"; import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; import "feast/core/OnDemandFeatureView.proto"; import "feast/core/FeatureView.proto"; import "feast/core/Feature.proto"; diff --git a/sdk/python/docs/conf.py b/sdk/python/docs/conf.py index 8f873d21b6..5e8fd11d16 100644 --- a/sdk/python/docs/conf.py +++ b/sdk/python/docs/conf.py @@ -25,6 +25,29 @@ sys.path.insert(0, os.path.abspath("../..")) +# -- Build protos --------------------------------------------------------- + +# For an unknown reason, the Python protos stopped being built correctly. +# See https://readthedocs.org/projects/feast/builds/17686555/ for an +# example where the Python protos did not build, which subsequently broke +# the RTD build. 
In order to fix this, we manually compile the protos. +import subprocess + +from pathlib import Path + +# cwd will be feast/sdk/python/docs/source +cwd = Path(os.getcwd()) + +# Change to feast/ +os.chdir(cwd.parent.parent.parent.parent) + +# Compile Python protos +result = subprocess.run(["python", "setup.py", "build_python_protos", "--inplace"], capture_output=True) +stdout = result.stdout.decode("utf-8") +stderr = result.stderr.decode("utf-8") +print(f"Apply stdout:\n{stdout}") +print(f"Apply stderr:\n{stderr}") + # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. diff --git a/sdk/python/docs/index.rst b/sdk/python/docs/index.rst index 07b9d9a77e..beca384137 100644 --- a/sdk/python/docs/index.rst +++ b/sdk/python/docs/index.rst @@ -1,308 +1,441 @@ Feast Python API Documentation ============================== +.. We prefer 'autoclass' instead of 'autoclass' as 'autoclass' can specify a class, whereas + 'autoclass' will pull in all public classes and methods from that module, which we typically + do not want. Feature Store ================== -.. automodule:: feast.feature_store - :members: - :undoc-members: - :show-inheritance: +.. autoclass:: feast.feature_store.FeatureStore + :members: Config ================== -.. automodule:: feast.repo_config +.. autoclass:: feast.repo_config.RepoConfig + :members: + +.. autoclass:: feast.repo_config.RegistryConfig :members: - :exclude-members: load_repo_config, FeastBaseModel Data Source ================== -.. automodule:: feast.data_source - :inherited-members: +.. autoclass:: feast.data_source.DataSource :members: - :exclude-members: KafkaOptions, KafkaSource, KinesisOptions, KinesisSource, PushSource, RequestSource, RequestDataSource -Request Source +File Source ------------------ -.. automodule:: feast.data_source - :members: RequestSource +.. 
autoclass:: feast.infra.offline_stores.file_source.FileSource + :members: -Push Source +Snowflake Source ------------------ -.. automodule:: feast.data_source - :members: PushSource +.. autoclass:: feast.infra.offline_stores.snowflake_source.SnowflakeSource + :members: BigQuery Source ------------------ -.. automodule:: feast.infra.offline_stores.bigquery_source +.. autoclass:: feast.infra.offline_stores.bigquery_source.BigQuerySource :members: - :exclude-members: BigQueryOptions Redshift Source ------------------ -.. automodule:: feast.infra.offline_stores.redshift_source +.. autoclass:: feast.infra.offline_stores.redshift_source.RedshiftSource :members: - :exclude-members: RedshiftOptions -Snowflake Source +Spark Source ------------------ -.. automodule:: feast.infra.offline_stores.snowflake_source +.. autoclass:: feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource :members: - :exclude-members: SnowflakeOptions -Spark Source +Trino Source ------------------ -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark_source +.. autoclass:: feast.infra.offline_stores.contrib.trino_offline_store.trino_source.TrinoSource :members: - :exclude-members: SparkOptions -Trino Source +PostgreSQL Source ------------------ -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino_source +.. autoclass:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source.PostgreSQLSource :members: - :exclude-members: TrinoOptions -PostgreSQL Source +Request Source ------------------ -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source +.. autoclass:: feast.data_source.RequestSource :members: - :exclude-members: PostgreSQLOptions -File Source +Push Source ------------------ -.. automodule:: feast.infra.offline_stores.file_source +.. autoclass:: feast.data_source.PushSource + :members: + +Kafka Source +------------------ + +.. 
autoclass:: feast.data_source.KafkaSource + :members: + +Kinesis Source +------------------ + +.. autoclass:: feast.data_source.KinesisSource :members: - :exclude-members: FileOptions Entity ================== -.. automodule:: feast.entity - :inherited-members: +.. autoclass:: feast.entity.Entity :members: Feature View ================== -.. automodule:: feast.feature_view +.. autoclass:: feast.base_feature_view.BaseFeatureView + :members: + +Feature View +---------------------- + +.. autoclass:: feast.feature_view.FeatureView :members: On Demand Feature View ---------------------- -.. automodule:: feast.on_demand_feature_view +.. autoclass:: feast.on_demand_feature_view.OnDemandFeatureView + :members: + +Batch Feature View +---------------------- + +.. autoclass:: feast.batch_feature_view.BatchFeatureView :members: Stream Feature View ---------------------- -.. automodule:: feast.stream_feature_view +.. autoclass:: feast.stream_feature_view.StreamFeatureView :members: -Feature +Field ================== -.. automodule:: feast.feature - :inherited-members: +.. autoclass:: feast.field.Field :members: Feature Service ================== -.. automodule:: feast.feature_service - :inherited-members: +.. autoclass:: feast.feature_service.FeatureService :members: Registry ================== -.. automodule:: feast.registry - :inherited-members: +.. autoclass:: feast.infra.registry.base_registry.BaseRegistry + :members: + +Registry +---------------------- + +.. autoclass:: feast.infra.registry.registry.Registry + :members: + +SQL Registry +---------------------- + +.. autoclass:: feast.infra.registry.sql.SqlRegistry :members: Registry Store ================== -.. automodule:: feast.registry_store - :inherited-members: +.. autoclass:: feast.infra.registry.registry_store.RegistryStore :members: - :exclude-members: NoopRegistryStore -SQL Registry Store +File Registry Store ----------------------- -.. automodule:: feast.infra.registry_stores.sql +.. 
autoclass:: feast.infra.registry.file.FileRegistryStore :members: - :noindex: -PostgreSQL Registry Store +GCS Registry Store ----------------------- -.. automodule:: feast.infra.registry_stores.contrib.postgres.registry_store +.. autoclass:: feast.infra.registry.gcs.GCSRegistryStore :members: - :noindex: +S3 Registry Store +----------------------- + +.. autoclass:: feast.infra.registry.s3.S3RegistryStore + :members: + +PostgreSQL Registry Store +----------------------- + +.. autoclass:: feast.infra.registry.contrib.postgres.postgres_registry_store.PostgreSQLRegistryStore + :members: Provider ================== -.. automodule:: feast.infra.provider - :inherited-members: +.. autoclass:: feast.infra.provider.Provider :members: Passthrough Provider -------------------- -.. automodule:: feast.infra.passthrough_provider +.. autoclass:: feast.infra.passthrough_provider.PassthroughProvider :members: Local Provider ------------------ -.. automodule:: feast.infra.local +.. autoclass:: feast.infra.local.LocalProvider :members: - :exclude-members: LocalRegistryStore GCP Provider ------------------ -.. automodule:: feast.infra.gcp +.. autoclass:: feast.infra.gcp.GcpProvider :members: - :exclude-members: GCSRegistryStore AWS Provider ------------------ -.. automodule:: feast.infra.aws +.. autoclass:: feast.infra.aws.AwsProvider :members: - :exclude-members: S3RegistryStore Offline Store ================== -.. automodule:: feast.infra.offline_stores.offline_store +.. autoclass:: feast.infra.offline_stores.offline_store.OfflineStore + :members: + +.. autoclass:: feast.infra.offline_stores.offline_store.RetrievalJob :members: File Offline Store ------------------ -.. automodule:: feast.infra.offline_stores.file +.. autoclass:: feast.infra.offline_stores.file.FileOfflineStore + :members: + +.. autoclass:: feast.infra.offline_stores.file.FileOfflineStoreConfig + :members: + +.. 
autoclass:: feast.infra.offline_stores.file.FileRetrievalJob + :members: + +Snowflake Offline Store +----------------------- + +.. autoclass:: feast.infra.offline_stores.snowflake.SnowflakeOfflineStore + :members: + +.. autoclass:: feast.infra.offline_stores.snowflake.SnowflakeOfflineStoreConfig + :members: + +.. autoclass:: feast.infra.offline_stores.snowflake.SnowflakeRetrievalJob :members: BigQuery Offline Store ---------------------- -.. automodule:: feast.infra.offline_stores.bigquery +.. autoclass:: feast.infra.offline_stores.bigquery.BigQueryOfflineStore + :members: + +.. autoclass:: feast.infra.offline_stores.bigquery.BigQueryOfflineStoreConfig + :members: + +.. autoclass:: feast.infra.offline_stores.bigquery.BigQueryRetrievalJob :members: Redshift Offline Store ---------------------- -.. automodule:: feast.infra.offline_stores.redshift +.. autoclass:: feast.infra.offline_stores.redshift.RedshiftOfflineStore :members: -Snowflake Offline Store ------------------------ +.. autoclass:: feast.infra.offline_stores.redshift.RedshiftOfflineStoreConfig + :members: -.. automodule:: feast.infra.offline_stores.snowflake +.. autoclass:: feast.infra.offline_stores.redshift.RedshiftRetrievalJob :members: Spark Offline Store ------------------- -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark +.. autoclass:: feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkOfflineStore + :members: + +.. autoclass:: feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkOfflineStoreConfig + :members: + +.. autoclass:: feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkRetrievalJob :members: Trino Offline Store ------------------- -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino +.. autoclass:: feast.infra.offline_stores.contrib.trino_offline_store.trino.TrinoOfflineStore + :members: + +.. 
autoclass:: feast.infra.offline_stores.contrib.trino_offline_store.trino.TrinoOfflineStoreConfig + :members: + +.. autoclass:: feast.infra.offline_stores.contrib.trino_offline_store.trino.TrinoRetrievalJob :members: PostgreSQL Offline Store ------------------------ -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres +.. autoclass:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStore :members: +.. autoclass:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStoreConfig + :members: + +.. autoclass:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLRetrievalJob + :members: Online Store ================== -.. automodule:: feast.infra.online_stores.online_store - :inherited-members: +.. autoclass:: feast.infra.online_stores.online_store.OnlineStore :members: Sqlite Online Store ------------------- -.. automodule:: feast.infra.online_stores.sqlite +.. autoclass:: feast.infra.online_stores.sqlite.SqliteOnlineStore + :members: + +.. autoclass:: feast.infra.online_stores.sqlite.SqliteOnlineStoreConfig :members: - :noindex: Datastore Online Store ---------------------- -.. automodule:: feast.infra.online_stores.datastore +.. autoclass:: feast.infra.online_stores.datastore.DatastoreOnlineStore + :members: + +.. autoclass:: feast.infra.online_stores.datastore.DatastoreOnlineStoreConfig :members: - :noindex: DynamoDB Online Store --------------------- -.. automodule:: feast.infra.online_stores.dynamodb +.. autoclass:: feast.infra.online_stores.dynamodb.DynamoDBOnlineStore + :members: + +.. autoclass:: feast.infra.online_stores.dynamodb.DynamoDBOnlineStoreConfig :members: - :noindex: Redis Online Store ------------------ -.. automodule:: feast.infra.online_stores.redis +.. autoclass:: feast.infra.online_stores.redis.RedisOnlineStore + :members: + +.. 
autoclass:: feast.infra.online_stores.redis.RedisOnlineStoreConfig :members: - :noindex: PostgreSQL Online Store ----------------------- -.. automodule:: feast.infra.online_stores.contrib.postgres +.. autoclass:: feast.infra.online_stores.contrib.postgres.PostgreSQLOnlineStore + :members: + +.. autoclass:: feast.infra.online_stores.contrib.postgres.PostgreSQLOnlineStoreConfig :members: - :noindex: HBase Online Store ----------------------- -.. automodule:: feast.infra.online_stores.contrib.hbase_online_store.hbase +.. autoclass:: feast.infra.online_stores.contrib.hbase_online_store.hbase.HbaseOnlineStore :members: - :noindex: +.. autoclass:: feast.infra.online_stores.contrib.hbase_online_store.hbase.HbaseOnlineStoreConfig + :members: + +Cassandra Online Store +----------------------- + +.. autoclass:: feast.infra.online_stores.contrib.cassandra_online_store.cassandra_online_store.CassandraOnlineStore + :members: + +.. autoclass:: feast.infra.online_stores.contrib.cassandra_online_store.cassandra_online_store.CassandraOnlineStoreConfig + :members: Batch Materialization Engine ============================ -.. automodule:: feast.infra.materialization - :members: BatchMaterializationEngine, MaterializationJob, MaterializationTask +.. autoclass:: feast.infra.materialization.batch_materialization_engine.BatchMaterializationEngine + :members: + +.. autoclass:: feast.infra.materialization.batch_materialization_engine.MaterializationJob + :members: + +.. autoclass:: feast.infra.materialization.batch_materialization_engine.MaterializationTask + :members: Local Engine ------------ -.. autoclass:: feast.infra.materialization.LocalMaterializationEngine + +.. autoclass:: feast.infra.materialization.local_engine.LocalMaterializationEngine :members: - :noindex: -(Alpha) Lambda Based Engine +.. autoclass:: feast.infra.materialization.local_engine.LocalMaterializationEngineConfig + :members: + +.. 
autoclass:: feast.infra.materialization.local_engine.LocalMaterializationJob + :members: + +Bytewax Engine --------------------------- -.. autoclass:: feast.infra.materialization.lambda.lambda_engine +.. autoclass:: feast.infra.materialization.contrib.bytewax.bytewax_materialization_engine.BytewaxMaterializationEngine + :members: + +.. autoclass:: feast.infra.materialization.contrib.bytewax.bytewax_materialization_engine.BytewaxMaterializationEngineConfig + :members: + +.. autoclass:: feast.infra.materialization.contrib.bytewax.bytewax_materialization_job.BytewaxMaterializationJob + :members: + +Snowflake Engine +--------------------------- + +.. autoclass:: feast.infra.materialization.snowflake_engine.SnowflakeMaterializationEngine + :members: + +.. autoclass:: feast.infra.materialization.snowflake_engine.SnowflakeMaterializationEngineConfig + :members: + +.. autoclass:: feast.infra.materialization.snowflake_engine.SnowflakeMaterializationJob + :members: + +(Alpha) AWS Lambda Engine +--------------------------- + +.. autoclass:: feast.infra.materialization.aws_lambda.lambda_engine.LambdaMaterializationEngine + :members: + +.. autoclass:: feast.infra.materialization.aws_lambda.lambda_engine.LambdaMaterializationEngineConfig + :members: + +.. autoclass:: feast.infra.materialization.aws_lambda.lambda_engine.LambdaMaterializationJob :members: - :noindex: diff --git a/sdk/python/docs/source/conf.py b/sdk/python/docs/source/conf.py deleted file mode 100644 index 8f873d21b6..0000000000 --- a/sdk/python/docs/source/conf.py +++ /dev/null @@ -1,178 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Feast documentation build configuration file, created by -# sphinx-quickstart on Sat Nov 30 15:06:53 2019. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. 
-# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys - -import sphinx_rtd_theme - -sys.path.insert(0, os.path.abspath("../../feast")) -sys.path.insert(0, os.path.abspath("../..")) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.doctest", - "sphinx.ext.intersphinx", - "sphinx.ext.todo", - "sphinx.ext.coverage", - "sphinx.ext.mathjax", - "sphinx.ext.ifconfig", - "sphinx.ext.viewcode", - "sphinx.ext.githubpages", - "sphinx.ext.napoleon", - "sphinx.ext.autodoc", - "sphinx_rtd_theme", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# General information about the project. -project = "Feast" -copyright = "2021, Feast Authors" -author = "Feast Authors" - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. - -# TODO: Add the below versions back to documentation building. 
-# version = ( -# os.popen("git describe --tags $(git rev-list --tags --max-count=1)").read().strip() -# ) -# The full version, including alpha/beta/rc tags. -# release = ( -# os.popen("git describe --tags $(git rev-list --tags --max-count=1)").read().strip() -# ) - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - - -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = "Feastdoc" - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). 
- # - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "Feast.tex", "Feast Documentation", "Feast Authors", "manual") -] - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "feast", "Feast Documentation", [author], 1)] - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - "Feast", - "Feast Documentation", - author, - "Feast", - "One line description of project.", - "Miscellaneous", - ) -] - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {"https://docs.python.org/": None} diff --git a/sdk/python/docs/source/feast.diff.rst b/sdk/python/docs/source/feast.diff.rst deleted file mode 100644 index e414217171..0000000000 --- a/sdk/python/docs/source/feast.diff.rst +++ /dev/null @@ -1,37 +0,0 @@ -feast.diff package -================== - -Submodules ----------- - -feast.diff.infra\_diff module ------------------------------ - -.. automodule:: feast.diff.infra_diff - :members: - :undoc-members: - :show-inheritance: - -feast.diff.property\_diff module --------------------------------- - -.. 
automodule:: feast.diff.property_diff - :members: - :undoc-members: - :show-inheritance: - -feast.diff.registry\_diff module --------------------------------- - -.. automodule:: feast.diff.registry_diff - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.diff - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.dqm.profilers.rst b/sdk/python/docs/source/feast.dqm.profilers.rst deleted file mode 100644 index 24f452ada8..0000000000 --- a/sdk/python/docs/source/feast.dqm.profilers.rst +++ /dev/null @@ -1,29 +0,0 @@ -feast.dqm.profilers package -=========================== - -Submodules ----------- - -feast.dqm.profilers.ge\_profiler module ---------------------------------------- - -.. automodule:: feast.dqm.profilers.ge_profiler - :members: - :undoc-members: - :show-inheritance: - -feast.dqm.profilers.profiler module ------------------------------------ - -.. automodule:: feast.dqm.profilers.profiler - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.dqm.profilers - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.dqm.rst b/sdk/python/docs/source/feast.dqm.rst deleted file mode 100644 index 0c1b82f0fa..0000000000 --- a/sdk/python/docs/source/feast.dqm.rst +++ /dev/null @@ -1,29 +0,0 @@ -feast.dqm package -================= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.dqm.profilers - -Submodules ----------- - -feast.dqm.errors module ------------------------ - -.. automodule:: feast.dqm.errors - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.dqm - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.materialization.lambda.rst b/sdk/python/docs/source/feast.infra.materialization.lambda.rst deleted file mode 100644 index 7ca1d44314..0000000000 --- a/sdk/python/docs/source/feast.infra.materialization.lambda.rst +++ /dev/null @@ -1,29 +0,0 @@ -feast.infra.materialization.lambda package -========================================== - -Submodules ----------- - -feast.infra.materialization.lambda.app module ---------------------------------------------- - -.. automodule:: feast.infra.materialization.lambda.app - :members: - :undoc-members: - :show-inheritance: - -feast.infra.materialization.lambda.lambda\_engine module --------------------------------------------------------- - -.. automodule:: feast.infra.materialization.lambda.lambda_engine - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.materialization.lambda - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.materialization.rst b/sdk/python/docs/source/feast.infra.materialization.rst deleted file mode 100644 index ff3e1cf135..0000000000 --- a/sdk/python/docs/source/feast.infra.materialization.rst +++ /dev/null @@ -1,37 +0,0 @@ -feast.infra.materialization package -=================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.materialization.lambda - -Submodules ----------- - -feast.infra.materialization.batch\_materialization\_engine module ------------------------------------------------------------------ - -.. automodule:: feast.infra.materialization.batch_materialization_engine - :members: - :undoc-members: - :show-inheritance: - -feast.infra.materialization.local\_engine module ------------------------------------------------- - -.. 
automodule:: feast.infra.materialization.local_engine - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.materialization - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.rst deleted file mode 100644 index a80690fe85..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.rst +++ /dev/null @@ -1,37 +0,0 @@ -feast.infra.offline\_stores.contrib.postgres\_offline\_store package -==================================================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.offline_stores.contrib.postgres_offline_store.tests - -Submodules ----------- - -feast.infra.offline\_stores.contrib.postgres\_offline\_store.postgres module ----------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.postgres\_offline\_store.postgres\_source module ------------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.postgres_offline_store - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.tests.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.tests.rst deleted file mode 100644 index 35e60d2998..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.postgres_offline_store.tests.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.offline\_stores.contrib.postgres\_offline\_store.tests package -========================================================================== - -Submodules ----------- - -feast.infra.offline\_stores.contrib.postgres\_offline\_store.tests.data\_source module --------------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.tests - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst deleted file mode 100644 index 39902da130..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst +++ /dev/null @@ -1,39 +0,0 @@ -feast.infra.offline\_stores.contrib package -=========================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.offline_stores.contrib.postgres_offline_store - feast.infra.offline_stores.contrib.spark_offline_store - feast.infra.offline_stores.contrib.trino_offline_store - -Submodules ----------- - -feast.infra.offline\_stores.contrib.contrib\_repo\_configuration module ------------------------------------------------------------------------ - -.. 
automodule:: feast.infra.offline_stores.contrib.contrib_repo_configuration - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.postgres\_repo\_configuration module ------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_repo_configuration - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.rst deleted file mode 100644 index b8b79bb48e..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.rst +++ /dev/null @@ -1,37 +0,0 @@ -feast.infra.offline\_stores.contrib.spark\_offline\_store package -================================================================= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.offline_stores.contrib.spark_offline_store.tests - -Submodules ----------- - -feast.infra.offline\_stores.contrib.spark\_offline\_store.spark module ----------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.spark\_offline\_store.spark\_source module ------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.spark_offline_store - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.tests.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.tests.rst deleted file mode 100644 index 8b0f9bd88b..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.spark_offline_store.tests.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.offline\_stores.contrib.spark\_offline\_store.tests package -======================================================================= - -Submodules ----------- - -feast.infra.offline\_stores.contrib.spark\_offline\_store.tests.data\_source module ------------------------------------------------------------------------------------ - -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.tests.data_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.tests - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.connectors.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.connectors.rst deleted file mode 100644 index a0ee8dceab..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.connectors.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.offline\_stores.contrib.trino\_offline\_store.connectors package -============================================================================ - -Submodules ----------- - -feast.infra.offline\_stores.contrib.trino\_offline\_store.connectors.upload module ----------------------------------------------------------------------------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.trino_offline_store.connectors.upload - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.connectors - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.rst deleted file mode 100644 index 857326003f..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.rst +++ /dev/null @@ -1,55 +0,0 @@ -feast.infra.offline\_stores.contrib.trino\_offline\_store package -================================================================= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.offline_stores.contrib.trino_offline_store.connectors - feast.infra.offline_stores.contrib.trino_offline_store.test_config - feast.infra.offline_stores.contrib.trino_offline_store.tests - -Submodules ----------- - -feast.infra.offline\_stores.contrib.trino\_offline\_store.trino module ----------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.trino\_offline\_store.trino\_queries module -------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino_queries - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.trino\_offline\_store.trino\_source module ------------------------------------------------------------------------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino_source - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.contrib.trino\_offline\_store.trino\_type\_map module ---------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino_type_map - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.test_config.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.test_config.rst deleted file mode 100644 index ef43a191d0..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.test_config.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.offline\_stores.contrib.trino\_offline\_store.test\_config package -============================================================================== - -Submodules ----------- - -feast.infra.offline\_stores.contrib.trino\_offline\_store.test\_config.manual\_tests module -------------------------------------------------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.test_config.manual_tests - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.trino_offline_store.test_config - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.tests.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.tests.rst deleted file mode 100644 index 9102f1f8d6..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.trino_offline_store.tests.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.offline\_stores.contrib.trino\_offline\_store.tests package -======================================================================= - -Submodules ----------- - -feast.infra.offline\_stores.contrib.trino\_offline\_store.tests.data\_source module ------------------------------------------------------------------------------------ - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.tests.data_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.tests - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.rst b/sdk/python/docs/source/feast.infra.offline_stores.rst deleted file mode 100644 index 7949c9efb3..0000000000 --- a/sdk/python/docs/source/feast.infra.offline_stores.rst +++ /dev/null @@ -1,101 +0,0 @@ -feast.infra.offline\_stores package -=================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.offline_stores.contrib - -Submodules ----------- - -feast.infra.offline\_stores.bigquery module -------------------------------------------- - -.. automodule:: feast.infra.offline_stores.bigquery - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.bigquery\_source module ---------------------------------------------------- - -.. 
automodule:: feast.infra.offline_stores.bigquery_source - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.file module ---------------------------------------- - -.. automodule:: feast.infra.offline_stores.file - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.file\_source module ------------------------------------------------ - -.. automodule:: feast.infra.offline_stores.file_source - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.offline\_store module -------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.offline_store - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.offline\_utils module -------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.offline_utils - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.redshift module -------------------------------------------- - -.. automodule:: feast.infra.offline_stores.redshift - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.redshift\_source module ---------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.redshift_source - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.snowflake module --------------------------------------------- - -.. automodule:: feast.infra.offline_stores.snowflake - :members: - :undoc-members: - :show-inheritance: - -feast.infra.offline\_stores.snowflake\_source module ----------------------------------------------------- - -.. automodule:: feast.infra.offline_stores.snowflake_source - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.infra.offline_stores - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.online_stores.contrib.hbase_online_store.rst b/sdk/python/docs/source/feast.infra.online_stores.contrib.hbase_online_store.rst deleted file mode 100644 index ce24902304..0000000000 --- a/sdk/python/docs/source/feast.infra.online_stores.contrib.hbase_online_store.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.online\_stores.contrib.hbase\_online\_store package -=============================================================== - -Submodules ----------- - -feast.infra.online\_stores.contrib.hbase\_online\_store.hbase module --------------------------------------------------------------------- - -.. automodule:: feast.infra.online_stores.contrib.hbase_online_store.hbase - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.online_stores.contrib.hbase_online_store - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.online_stores.contrib.rst b/sdk/python/docs/source/feast.infra.online_stores.contrib.rst deleted file mode 100644 index 7315bb741e..0000000000 --- a/sdk/python/docs/source/feast.infra.online_stores.contrib.rst +++ /dev/null @@ -1,37 +0,0 @@ -feast.infra.online\_stores.contrib package -========================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.online_stores.contrib.hbase_online_store - -Submodules ----------- - -feast.infra.online\_stores.contrib.hbase\_repo\_configuration module --------------------------------------------------------------------- - -.. automodule:: feast.infra.online_stores.contrib.hbase_repo_configuration - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.contrib.postgres module --------------------------------------------------- - -.. 
automodule:: feast.infra.online_stores.contrib.postgres - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.online_stores.contrib - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.online_stores.rst b/sdk/python/docs/source/feast.infra.online_stores.rst deleted file mode 100644 index 65758c409c..0000000000 --- a/sdk/python/docs/source/feast.infra.online_stores.rst +++ /dev/null @@ -1,77 +0,0 @@ -feast.infra.online\_stores package -================================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.online_stores.contrib - -Submodules ----------- - -feast.infra.online\_stores.datastore module -------------------------------------------- - -.. automodule:: feast.infra.online_stores.datastore - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.dynamodb module ------------------------------------------- - -.. automodule:: feast.infra.online_stores.dynamodb - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.helpers module ------------------------------------------ - -.. automodule:: feast.infra.online_stores.helpers - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.online\_store module ------------------------------------------------ - -.. automodule:: feast.infra.online_stores.online_store - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.redis module ---------------------------------------- - -.. automodule:: feast.infra.online_stores.redis - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.snowflake module -------------------------------------------- - -.. automodule:: feast.infra.online_stores.snowflake - :members: - :undoc-members: - :show-inheritance: - -feast.infra.online\_stores.sqlite module ----------------------------------------- - -.. 
automodule:: feast.infra.online_stores.sqlite - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.online_stores - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.registry_stores.rst b/sdk/python/docs/source/feast.infra.registry_stores.rst deleted file mode 100644 index cff02fa338..0000000000 --- a/sdk/python/docs/source/feast.infra.registry_stores.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.infra.registry\_stores package -==================================== - -Submodules ----------- - -feast.infra.registry\_stores.sql module ---------------------------------------- - -.. automodule:: feast.infra.registry_stores.sql - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.registry_stores - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.rst b/sdk/python/docs/source/feast.infra.rst deleted file mode 100644 index 42c7d1334b..0000000000 --- a/sdk/python/docs/source/feast.infra.rst +++ /dev/null @@ -1,81 +0,0 @@ -feast.infra package -=================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.infra.materialization - feast.infra.offline_stores - feast.infra.online_stores - feast.infra.registry_stores - feast.infra.utils - -Submodules ----------- - -feast.infra.aws module ----------------------- - -.. automodule:: feast.infra.aws - :members: - :undoc-members: - :show-inheritance: - -feast.infra.gcp module ----------------------- - -.. automodule:: feast.infra.gcp - :members: - :undoc-members: - :show-inheritance: - -feast.infra.infra\_object module --------------------------------- - -.. automodule:: feast.infra.infra_object - :members: - :undoc-members: - :show-inheritance: - -feast.infra.key\_encoding\_utils module ---------------------------------------- - -.. 
automodule:: feast.infra.key_encoding_utils - :members: - :undoc-members: - :show-inheritance: - -feast.infra.local module ------------------------- - -.. automodule:: feast.infra.local - :members: - :undoc-members: - :show-inheritance: - -feast.infra.passthrough\_provider module ----------------------------------------- - -.. automodule:: feast.infra.passthrough_provider - :members: - :undoc-members: - :show-inheritance: - -feast.infra.provider module ---------------------------- - -.. automodule:: feast.infra.provider - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.utils.postgres.rst b/sdk/python/docs/source/feast.infra.utils.postgres.rst deleted file mode 100644 index 119c8c1dee..0000000000 --- a/sdk/python/docs/source/feast.infra.utils.postgres.rst +++ /dev/null @@ -1,29 +0,0 @@ -feast.infra.utils.postgres package -================================== - -Submodules ----------- - -feast.infra.utils.postgres.connection\_utils module ---------------------------------------------------- - -.. automodule:: feast.infra.utils.postgres.connection_utils - :members: - :undoc-members: - :show-inheritance: - -feast.infra.utils.postgres.postgres\_config module --------------------------------------------------- - -.. automodule:: feast.infra.utils.postgres.postgres_config - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.utils.postgres - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.utils.rst b/sdk/python/docs/source/feast.infra.utils.rst deleted file mode 100644 index ffada49797..0000000000 --- a/sdk/python/docs/source/feast.infra.utils.rst +++ /dev/null @@ -1,45 +0,0 @@ -feast.infra.utils package -========================= - -Subpackages ------------ - -.. 
toctree:: - :maxdepth: 4 - - feast.infra.utils.postgres - -Submodules ----------- - -feast.infra.utils.aws\_utils module ------------------------------------ - -.. automodule:: feast.infra.utils.aws_utils - :members: - :undoc-members: - :show-inheritance: - -feast.infra.utils.hbase\_utils module -------------------------------------- - -.. automodule:: feast.infra.utils.hbase_utils - :members: - :undoc-members: - :show-inheritance: - -feast.infra.utils.snowflake\_utils module ------------------------------------------ - -.. automodule:: feast.infra.utils.snowflake_utils - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.infra.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.loaders.rst b/sdk/python/docs/source/feast.loaders.rst deleted file mode 100644 index d4968a2999..0000000000 --- a/sdk/python/docs/source/feast.loaders.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.loaders package -===================== - -Submodules ----------- - -feast.loaders.yaml module -------------------------- - -.. automodule:: feast.loaders.yaml - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.loaders - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.feast.core.rst b/sdk/python/docs/source/feast.protos.feast.core.rst deleted file mode 100644 index aaed49cd73..0000000000 --- a/sdk/python/docs/source/feast.protos.feast.core.rst +++ /dev/null @@ -1,333 +0,0 @@ -feast.protos.feast.core package -=============================== - -Submodules ----------- - -feast.protos.feast.core.Aggregation\_pb2 module ------------------------------------------------ - -.. automodule:: feast.protos.feast.core.Aggregation_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Aggregation\_pb2\_grpc module ------------------------------------------------------ - -.. 
automodule:: feast.protos.feast.core.Aggregation_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DataFormat\_pb2 module ----------------------------------------------- - -.. automodule:: feast.protos.feast.core.DataFormat_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DataFormat\_pb2\_grpc module ----------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DataFormat_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DataSource\_pb2 module ----------------------------------------------- - -.. automodule:: feast.protos.feast.core.DataSource_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DataSource\_pb2\_grpc module ----------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DataSource_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DatastoreTable\_pb2 module --------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DatastoreTable_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DatastoreTable\_pb2\_grpc module --------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DatastoreTable_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DynamoDBTable\_pb2 module -------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DynamoDBTable_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.DynamoDBTable\_pb2\_grpc module -------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.DynamoDBTable_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Entity\_pb2 module ------------------------------------------- - -.. 
automodule:: feast.protos.feast.core.Entity_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Entity\_pb2\_grpc module ------------------------------------------------- - -.. automodule:: feast.protos.feast.core.Entity_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureService\_pb2 module --------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureService_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureService\_pb2\_grpc module --------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureService_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureTable\_pb2 module ------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureTable_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureTable\_pb2\_grpc module ------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureTable_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureViewProjection\_pb2 module ---------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureViewProjection_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureViewProjection\_pb2\_grpc module ---------------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.FeatureViewProjection_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureView\_pb2 module ------------------------------------------------ - -.. 
automodule:: feast.protos.feast.core.FeatureView_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.FeatureView\_pb2\_grpc module ------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.FeatureView_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Feature\_pb2 module -------------------------------------------- - -.. automodule:: feast.protos.feast.core.Feature_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Feature\_pb2\_grpc module -------------------------------------------------- - -.. automodule:: feast.protos.feast.core.Feature_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.InfraObject\_pb2 module ------------------------------------------------ - -.. automodule:: feast.protos.feast.core.InfraObject_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.InfraObject\_pb2\_grpc module ------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.InfraObject_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.OnDemandFeatureView\_pb2 module -------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.OnDemandFeatureView_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.OnDemandFeatureView\_pb2\_grpc module -------------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.OnDemandFeatureView_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Registry\_pb2 module --------------------------------------------- - -.. automodule:: feast.protos.feast.core.Registry_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Registry\_pb2\_grpc module --------------------------------------------------- - -.. 
automodule:: feast.protos.feast.core.Registry_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.RequestFeatureView\_pb2 module ------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.RequestFeatureView_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.RequestFeatureView\_pb2\_grpc module ------------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.RequestFeatureView_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.SavedDataset\_pb2 module ------------------------------------------------- - -.. automodule:: feast.protos.feast.core.SavedDataset_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.SavedDataset\_pb2\_grpc module ------------------------------------------------------- - -.. automodule:: feast.protos.feast.core.SavedDataset_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.SqliteTable\_pb2 module ------------------------------------------------ - -.. automodule:: feast.protos.feast.core.SqliteTable_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.SqliteTable\_pb2\_grpc module ------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.SqliteTable_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Store\_pb2 module ------------------------------------------ - -.. automodule:: feast.protos.feast.core.Store_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.Store\_pb2\_grpc module ------------------------------------------------ - -.. automodule:: feast.protos.feast.core.Store_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.StreamFeatureView\_pb2 module ------------------------------------------------------ - -.. 
automodule:: feast.protos.feast.core.StreamFeatureView_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.StreamFeatureView\_pb2\_grpc module ------------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.StreamFeatureView_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.ValidationProfile\_pb2 module ------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.ValidationProfile_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.core.ValidationProfile\_pb2\_grpc module ------------------------------------------------------------ - -.. automodule:: feast.protos.feast.core.ValidationProfile_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.protos.feast.core - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.feast.rst b/sdk/python/docs/source/feast.protos.feast.rst deleted file mode 100644 index f519165db8..0000000000 --- a/sdk/python/docs/source/feast.protos.feast.rst +++ /dev/null @@ -1,21 +0,0 @@ -feast.protos.feast package -========================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.protos.feast.core - feast.protos.feast.serving - feast.protos.feast.storage - feast.protos.feast.types - -Module contents ---------------- - -.. 
automodule:: feast.protos.feast - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.feast.serving.rst b/sdk/python/docs/source/feast.protos.feast.serving.rst deleted file mode 100644 index 792335b189..0000000000 --- a/sdk/python/docs/source/feast.protos.feast.serving.rst +++ /dev/null @@ -1,61 +0,0 @@ -feast.protos.feast.serving package -================================== - -Submodules ----------- - -feast.protos.feast.serving.Connector\_pb2 module ------------------------------------------------- - -.. automodule:: feast.protos.feast.serving.Connector_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.serving.Connector\_pb2\_grpc module ------------------------------------------------------- - -.. automodule:: feast.protos.feast.serving.Connector_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.serving.ServingService\_pb2 module ------------------------------------------------------ - -.. automodule:: feast.protos.feast.serving.ServingService_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.serving.ServingService\_pb2\_grpc module ------------------------------------------------------------ - -.. automodule:: feast.protos.feast.serving.ServingService_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.serving.TransformationService\_pb2 module ------------------------------------------------------------- - -.. automodule:: feast.protos.feast.serving.TransformationService_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.serving.TransformationService\_pb2\_grpc module ------------------------------------------------------------------- - -.. automodule:: feast.protos.feast.serving.TransformationService_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: feast.protos.feast.serving - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.feast.storage.rst b/sdk/python/docs/source/feast.protos.feast.storage.rst deleted file mode 100644 index 90bc1adc9b..0000000000 --- a/sdk/python/docs/source/feast.protos.feast.storage.rst +++ /dev/null @@ -1,29 +0,0 @@ -feast.protos.feast.storage package -================================== - -Submodules ----------- - -feast.protos.feast.storage.Redis\_pb2 module --------------------------------------------- - -.. automodule:: feast.protos.feast.storage.Redis_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.storage.Redis\_pb2\_grpc module --------------------------------------------------- - -.. automodule:: feast.protos.feast.storage.Redis_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.protos.feast.storage - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.feast.types.rst b/sdk/python/docs/source/feast.protos.feast.types.rst deleted file mode 100644 index aeb31bc9ad..0000000000 --- a/sdk/python/docs/source/feast.protos.feast.types.rst +++ /dev/null @@ -1,61 +0,0 @@ -feast.protos.feast.types package -================================ - -Submodules ----------- - -feast.protos.feast.types.EntityKey\_pb2 module ----------------------------------------------- - -.. automodule:: feast.protos.feast.types.EntityKey_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.types.EntityKey\_pb2\_grpc module ----------------------------------------------------- - -.. automodule:: feast.protos.feast.types.EntityKey_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.types.Field\_pb2 module ------------------------------------------- - -.. 
automodule:: feast.protos.feast.types.Field_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.types.Field\_pb2\_grpc module ------------------------------------------------- - -.. automodule:: feast.protos.feast.types.Field_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.types.Value\_pb2 module ------------------------------------------- - -.. automodule:: feast.protos.feast.types.Value_pb2 - :members: - :undoc-members: - :show-inheritance: - -feast.protos.feast.types.Value\_pb2\_grpc module ------------------------------------------------- - -.. automodule:: feast.protos.feast.types.Value_pb2_grpc - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast.protos.feast.types - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.protos.rst b/sdk/python/docs/source/feast.protos.rst deleted file mode 100644 index 7bec91eb03..0000000000 --- a/sdk/python/docs/source/feast.protos.rst +++ /dev/null @@ -1,18 +0,0 @@ -feast.protos package -==================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.protos.feast - -Module contents ---------------- - -.. automodule:: feast.protos - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.rst b/sdk/python/docs/source/feast.rst deleted file mode 100644 index c000ac2e2b..0000000000 --- a/sdk/python/docs/source/feast.rst +++ /dev/null @@ -1,394 +0,0 @@ -feast package -============= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - feast.diff - feast.dqm - feast.infra - feast.loaders - feast.protos - feast.ui - -Submodules ----------- - -feast.aggregation module ------------------------- - -.. automodule:: feast.aggregation - :members: - :undoc-members: - :show-inheritance: - -feast.base\_feature\_view module --------------------------------- - -.. 
automodule:: feast.base_feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.batch\_feature\_view module ---------------------------------- - -.. automodule:: feast.batch_feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.cli module ----------------- - -.. automodule:: feast.cli - :members: - :undoc-members: - :show-inheritance: - -feast.constants module ----------------------- - -.. automodule:: feast.constants - :members: - :undoc-members: - :show-inheritance: - -feast.data\_format module -------------------------- - -.. automodule:: feast.data_format - :members: - :undoc-members: - :show-inheritance: - -feast.data\_source module -------------------------- - -.. automodule:: feast.data_source - :members: - :undoc-members: - :show-inheritance: - -feast.driver\_test\_data module -------------------------------- - -.. automodule:: feast.driver_test_data - :members: - :undoc-members: - :show-inheritance: - -feast.entity module -------------------- - -.. automodule:: feast.entity - :members: - :undoc-members: - :show-inheritance: - -feast.errors module -------------------- - -.. automodule:: feast.errors - :members: - :undoc-members: - :show-inheritance: - -feast.feast\_object module --------------------------- - -.. automodule:: feast.feast_object - :members: - :undoc-members: - :show-inheritance: - -feast.feature module --------------------- - -.. automodule:: feast.feature - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_logging module ------------------------------ - -.. automodule:: feast.feature_logging - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_server module ----------------------------- - -.. automodule:: feast.feature_server - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_service module ------------------------------ - -.. 
automodule:: feast.feature_service - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_store module ---------------------------- - -.. automodule:: feast.feature_store - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_view module --------------------------- - -.. automodule:: feast.feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.feature\_view\_projection module --------------------------------------- - -.. automodule:: feast.feature_view_projection - :members: - :undoc-members: - :show-inheritance: - -feast.field module ------------------- - -.. automodule:: feast.field - :members: - :undoc-members: - :show-inheritance: - -feast.flags module ------------------- - -.. automodule:: feast.flags - :members: - :undoc-members: - :show-inheritance: - -feast.flags\_helper module --------------------------- - -.. automodule:: feast.flags_helper - :members: - :undoc-members: - :show-inheritance: - -feast.importer module ---------------------- - -.. automodule:: feast.importer - :members: - :undoc-members: - :show-inheritance: - -feast.inference module ----------------------- - -.. automodule:: feast.inference - :members: - :undoc-members: - :show-inheritance: - -feast.names module ------------------- - -.. automodule:: feast.names - :members: - :undoc-members: - :show-inheritance: - -feast.on\_demand\_feature\_view module --------------------------------------- - -.. automodule:: feast.on_demand_feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.online\_response module ------------------------------ - -.. automodule:: feast.online_response - :members: - :undoc-members: - :show-inheritance: - -feast.project\_metadata module ------------------------------- - -.. automodule:: feast.project_metadata - :members: - :undoc-members: - :show-inheritance: - -feast.proto\_json module ------------------------- - -.. 
automodule:: feast.proto_json - :members: - :undoc-members: - :show-inheritance: - -feast.registry module ---------------------- - -.. automodule:: feast.registry - :members: - :undoc-members: - :show-inheritance: - -feast.registry\_store module ----------------------------- - -.. automodule:: feast.registry_store - :members: - :undoc-members: - :show-inheritance: - -feast.repo\_config module -------------------------- - -.. automodule:: feast.repo_config - :members: - :undoc-members: - :show-inheritance: - -feast.repo\_contents module ---------------------------- - -.. automodule:: feast.repo_contents - :members: - :undoc-members: - :show-inheritance: - -feast.repo\_operations module ------------------------------ - -.. automodule:: feast.repo_operations - :members: - :undoc-members: - :show-inheritance: - -feast.repo\_upgrade module --------------------------- - -.. automodule:: feast.repo_upgrade - :members: - :undoc-members: - :show-inheritance: - -feast.request\_feature\_view module ------------------------------------ - -.. automodule:: feast.request_feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.saved\_dataset module ---------------------------- - -.. automodule:: feast.saved_dataset - :members: - :undoc-members: - :show-inheritance: - -feast.stream\_feature\_view module ----------------------------------- - -.. automodule:: feast.stream_feature_view - :members: - :undoc-members: - :show-inheritance: - -feast.transformation\_server module ------------------------------------ - -.. automodule:: feast.transformation_server - :members: - :undoc-members: - :show-inheritance: - -feast.type\_map module ----------------------- - -.. automodule:: feast.type_map - :members: - :undoc-members: - :show-inheritance: - -feast.types module ------------------- - -.. automodule:: feast.types - :members: - :undoc-members: - :show-inheritance: - -feast.ui\_server module ------------------------ - -.. 
automodule:: feast.ui_server - :members: - :undoc-members: - :show-inheritance: - -feast.usage module ------------------- - -.. automodule:: feast.usage - :members: - :undoc-members: - :show-inheritance: - -feast.utils module ------------------- - -.. automodule:: feast.utils - :members: - :undoc-members: - :show-inheritance: - -feast.value\_type module ------------------------- - -.. automodule:: feast.value_type - :members: - :undoc-members: - :show-inheritance: - -feast.version module --------------------- - -.. automodule:: feast.version - :members: - :undoc-members: - :show-inheritance: - -feast.wait module ------------------ - -.. automodule:: feast.wait - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: feast - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/feast.ui.rst b/sdk/python/docs/source/feast.ui.rst deleted file mode 100644 index 01b16cb0a6..0000000000 --- a/sdk/python/docs/source/feast.ui.rst +++ /dev/null @@ -1,10 +0,0 @@ -feast.ui package -================ - -Module contents ---------------- - -.. automodule:: feast.ui - :members: - :undoc-members: - :show-inheritance: diff --git a/sdk/python/docs/source/index.rst b/sdk/python/docs/source/index.rst deleted file mode 100644 index 07b9d9a77e..0000000000 --- a/sdk/python/docs/source/index.rst +++ /dev/null @@ -1,308 +0,0 @@ -Feast Python API Documentation -============================== - - -Feature Store -================== - -.. automodule:: feast.feature_store - :members: - :undoc-members: - :show-inheritance: - -Config -================== - -.. automodule:: feast.repo_config - :members: - :exclude-members: load_repo_config, FeastBaseModel - -Data Source -================== - -.. 
automodule:: feast.data_source - :inherited-members: - :members: - :exclude-members: KafkaOptions, KafkaSource, KinesisOptions, KinesisSource, PushSource, RequestSource, RequestDataSource - -Request Source ------------------- - -.. automodule:: feast.data_source - :members: RequestSource - -Push Source ------------------- - -.. automodule:: feast.data_source - :members: PushSource - -BigQuery Source ------------------- - -.. automodule:: feast.infra.offline_stores.bigquery_source - :members: - :exclude-members: BigQueryOptions - -Redshift Source ------------------- - -.. automodule:: feast.infra.offline_stores.redshift_source - :members: - :exclude-members: RedshiftOptions - -Snowflake Source ------------------- - -.. automodule:: feast.infra.offline_stores.snowflake_source - :members: - :exclude-members: SnowflakeOptions - -Spark Source ------------------- - -.. automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark_source - :members: - :exclude-members: SparkOptions - -Trino Source ------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino_source - :members: - :exclude-members: TrinoOptions - -PostgreSQL Source ------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source - :members: - :exclude-members: PostgreSQLOptions - -File Source ------------------- - -.. automodule:: feast.infra.offline_stores.file_source - :members: - :exclude-members: FileOptions - -Entity -================== - -.. automodule:: feast.entity - :inherited-members: - :members: - -Feature View -================== - -.. automodule:: feast.feature_view - :members: - -On Demand Feature View ----------------------- - -.. automodule:: feast.on_demand_feature_view - :members: - -Stream Feature View ----------------------- - -.. automodule:: feast.stream_feature_view - :members: - -Feature -================== - -.. 
automodule:: feast.feature - :inherited-members: - :members: - -Feature Service -================== - -.. automodule:: feast.feature_service - :inherited-members: - :members: - -Registry -================== - -.. automodule:: feast.registry - :inherited-members: - :members: - -Registry Store -================== - -.. automodule:: feast.registry_store - :inherited-members: - :members: - :exclude-members: NoopRegistryStore - -SQL Registry Store ------------------------ - -.. automodule:: feast.infra.registry_stores.sql - :members: - :noindex: - -PostgreSQL Registry Store ------------------------ - -.. automodule:: feast.infra.registry_stores.contrib.postgres.registry_store - :members: - :noindex: - - -Provider -================== - -.. automodule:: feast.infra.provider - :inherited-members: - :members: - -Passthrough Provider --------------------- - -.. automodule:: feast.infra.passthrough_provider - :members: - -Local Provider ------------------- - -.. automodule:: feast.infra.local - :members: - :exclude-members: LocalRegistryStore - -GCP Provider ------------------- - -.. automodule:: feast.infra.gcp - :members: - :exclude-members: GCSRegistryStore - -AWS Provider ------------------- - -.. automodule:: feast.infra.aws - :members: - :exclude-members: S3RegistryStore - -Offline Store -================== - -.. automodule:: feast.infra.offline_stores.offline_store - :members: - -File Offline Store ------------------- - -.. automodule:: feast.infra.offline_stores.file - :members: - -BigQuery Offline Store ----------------------- - -.. automodule:: feast.infra.offline_stores.bigquery - :members: - -Redshift Offline Store ----------------------- - -.. automodule:: feast.infra.offline_stores.redshift - :members: - -Snowflake Offline Store ------------------------ - -.. automodule:: feast.infra.offline_stores.snowflake - :members: - -Spark Offline Store -------------------- - -.. 
automodule:: feast.infra.offline_stores.contrib.spark_offline_store.spark - :members: - -Trino Offline Store -------------------- - -.. automodule:: feast.infra.offline_stores.contrib.trino_offline_store.trino - :members: - -PostgreSQL Offline Store ------------------------- - -.. automodule:: feast.infra.offline_stores.contrib.postgres_offline_store.postgres - :members: - - -Online Store -================== - -.. automodule:: feast.infra.online_stores.online_store - :inherited-members: - :members: - -Sqlite Online Store -------------------- - -.. automodule:: feast.infra.online_stores.sqlite - :members: - :noindex: - -Datastore Online Store ----------------------- - -.. automodule:: feast.infra.online_stores.datastore - :members: - :noindex: - -DynamoDB Online Store ---------------------- - -.. automodule:: feast.infra.online_stores.dynamodb - :members: - :noindex: - -Redis Online Store ------------------- - -.. automodule:: feast.infra.online_stores.redis - :members: - :noindex: - -PostgreSQL Online Store ------------------------ - -.. automodule:: feast.infra.online_stores.contrib.postgres - :members: - :noindex: - -HBase Online Store ------------------------ - -.. automodule:: feast.infra.online_stores.contrib.hbase_online_store.hbase - :members: - :noindex: - - -Batch Materialization Engine -============================ - -.. automodule:: feast.infra.materialization - :members: BatchMaterializationEngine, MaterializationJob, MaterializationTask - -Local Engine ------------- -.. autoclass:: feast.infra.materialization.LocalMaterializationEngine - :members: - :noindex: - -(Alpha) Lambda Based Engine ---------------------------- - -.. autoclass:: feast.infra.materialization.lambda.lambda_engine - :members: - :noindex: diff --git a/sdk/python/docs/source/modules.rst b/sdk/python/docs/source/modules.rst deleted file mode 100644 index 3a6f8333ab..0000000000 --- a/sdk/python/docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -feast -===== - -.. 
toctree:: - :maxdepth: 4 - - feast diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 8461cf31b6..d043f1a973 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -5,18 +5,15 @@ from importlib_metadata import PackageNotFoundError, version as _version # type: ignore from feast.infra.offline_stores.bigquery_source import BigQuerySource +from feast.infra.offline_stores.contrib.athena_offline_store.athena_source import ( + AthenaSource, +) from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource from .batch_feature_view import BatchFeatureView -from .data_source import ( - KafkaSource, - KinesisSource, - PushSource, - RequestSource, - SourceType, -) +from .data_source import KafkaSource, KinesisSource, PushSource, RequestSource from .entity import Entity from .feature import Feature from .feature_service import FeatureService @@ -47,7 +44,6 @@ "FeatureView", "OnDemandFeatureView", "RepoConfig", - "SourceType", "StreamFeatureView", "ValueType", "BigQuerySource", @@ -57,4 +53,5 @@ "SnowflakeSource", "PushSource", "RequestSource", + "AthenaSource", ] diff --git a/sdk/python/feast/base_feature_view.py b/sdk/python/feast/base_feature_view.py index 5feb1d7d89..975537a394 100644 --- a/sdk/python/feast/base_feature_view.py +++ b/sdk/python/feast/base_feature_view.py @@ -35,8 +35,8 @@ class BaseFeatureView(ABC): maintainer. projection: The feature view projection storing modifications to be applied to this base feature view at retrieval time. - created_timestamp (optional): The time when the base feature view was created. - last_updated_timestamp (optional): The time when the base feature view was last + created_timestamp: The time when the base feature view was created. + last_updated_timestamp: The time when the base feature view was last updated. 
""" @@ -117,10 +117,16 @@ def __getitem__(self, item): cp = self.__copy__() if self.features: + feature_name_to_feature = { + feature.name: feature for feature in self.features + } referenced_features = [] - for feature in self.features: - if feature.name in item: - referenced_features.append(feature) + for feature in item: + if feature not in feature_name_to_feature: + raise ValueError( + f"Feature {feature} does not exist in this feature view." + ) + referenced_features.append(feature_name_to_feature[feature]) cp.projection.features = referenced_features else: cp.projection.desired_features = item diff --git a/sdk/python/feast/batch_feature_view.py b/sdk/python/feast/batch_feature_view.py index 2f9fb080db..707529a1a8 100644 --- a/sdk/python/feast/batch_feature_view.py +++ b/sdk/python/feast/batch_feature_view.py @@ -1,12 +1,16 @@ -from datetime import timedelta -from typing import Dict, List, Optional, Union +import warnings +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple, Union +from feast import flags_helper from feast.data_source import DataSource from feast.entity import Entity from feast.feature_view import FeatureView from feast.field import Field from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +warnings.simplefilter("once", RuntimeWarning) + SUPPORTED_BATCH_SOURCES = { "BigQuerySource", "FileSource", @@ -14,14 +18,48 @@ "SnowflakeSource", "SparkSource", "TrinoSource", + "AthenaSource", } class BatchFeatureView(FeatureView): + """ + A batch feature view defines a logical group of features that has only a batch data source. + + Attributes: + name: The unique name of the batch feature view. + entities: List of entities or entity join keys. + ttl: The amount of time this group of features lives. A ttl of 0 indicates that + this group of features lives forever. Note that large ttl's or a ttl of 0 + can result in extremely computationally intensive queries. 
+ schema: The schema of the feature view, including feature, timestamp, and entity + columns. If not specified, can be inferred from the underlying data source. + source: The batch source of data where this group of features is stored. + online: A boolean indicating whether online retrieval is enabled for this feature view. + description: A human-readable description. + tags: A dictionary of key-value pairs to store arbitrary metadata. + owner: The owner of the batch feature view, typically the email of the primary maintainer. + """ + + name: str + entities: List[str] + ttl: Optional[timedelta] + source: DataSource + schema: List[Field] + entity_columns: List[Field] + features: List[Field] + online: bool + description: str + tags: Dict[str, str] + owner: str + timestamp_field: str + materialization_intervals: List[Tuple[datetime, datetime]] + def __init__( self, *, - name: Optional[str] = None, + name: str, + source: DataSource, entities: Optional[Union[List[Entity], List[str]]] = None, ttl: Optional[timedelta] = None, tags: Optional[Dict[str, str]] = None, @@ -29,11 +67,14 @@ def __init__( description: str = "", owner: str = "", schema: Optional[List[Field]] = None, - source: Optional[DataSource] = None, ): + if not flags_helper.is_test(): + warnings.warn( + "Batch feature views are experimental features in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) - if source is None: - raise ValueError("Feature views need a source specified") if ( type(source).__name__ not in SUPPORTED_BATCH_SOURCES and source.to_proto().type != DataSourceProto.SourceType.CUSTOM_SOURCE @@ -47,8 +88,6 @@ def __init__( name=name, entities=entities, ttl=ttl, - batch_source=None, - stream_source=None, tags=tags, online=online, description=description, diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index 153c1a5ddd..af9aa51191 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import base64 import json import logging -import warnings +import os +import tempfile from datetime import datetime from pathlib import Path from typing import List, Optional @@ -25,8 +27,11 @@ from dateutil import parser from pygments import formatters, highlight, lexers -from feast import flags, flags_helper, utils -from feast.constants import DEFAULT_FEATURE_TRANSFORMATION_SERVER_PORT +from feast import utils +from feast.constants import ( + DEFAULT_FEATURE_TRANSFORMATION_SERVER_PORT, + FEATURE_STORE_YAML_ENV_NAME, +) from feast.errors import FeastObjectNotFoundException, FeastProviderLoginError from feast.feature_store import FeatureStore from feast.feature_view import FeatureView @@ -45,7 +50,6 @@ from feast.utils import maybe_local_tz _logger = logging.getLogger(__name__) -warnings.filterwarnings("ignore", category=DeprecationWarning, module="(?!feast)") class NoOptionDefaultFormat(click.Command): @@ -72,8 +76,17 @@ def format_options(self, ctx: click.Context, formatter: click.HelpFormatter): default="info", help="The logging level. 
One of DEBUG, INFO, WARNING, ERROR, and CRITICAL (case-insensitive).", ) +@click.option( + "--feature-store-yaml", + help="Override the directory where the CLI should look for the feature_store.yaml file.", +) @click.pass_context -def cli(ctx: click.Context, chdir: Optional[str], log_level: str): +def cli( + ctx: click.Context, + chdir: Optional[str], + log_level: str, + feature_store_yaml: Optional[str], +): """ Feast CLI @@ -83,6 +96,11 @@ def cli(ctx: click.Context, chdir: Optional[str], log_level: str): """ ctx.ensure_object(dict) ctx.obj["CHDIR"] = Path.cwd() if chdir is None else Path(chdir).absolute() + ctx.obj["FS_YAML_FILE"] = ( + Path(feature_store_yaml).absolute() + if feature_store_yaml + else ctx.obj["CHDIR"] / "feature_store.yaml" + ) try: level = getattr(logging, log_level.upper()) logging.basicConfig( @@ -143,8 +161,9 @@ def ui(ctx: click.Context, host: str, port: int, registry_ttl_sec: int): Shows the Feast UI over the current directory """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) # Pass in the registry_dump method to get around a circular dependency store.serve_ui( host=host, @@ -161,8 +180,9 @@ def endpoint(ctx: click.Context): Display feature server endpoints """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) endpoint = store.get_feature_server_endpoint() if endpoint is not None: _logger.info( @@ -188,8 +208,9 @@ def data_source_describe(ctx: click.Context, name: str): Describe a data source """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) 
+ store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) try: data_source = store.get_data_source(name) @@ -197,11 +218,6 @@ def data_source_describe(ctx: click.Context, name: str): print(e) exit(1) - warnings.warn( - "Describing data sources will only work properly if all data sources have names or table names specified. " - "Starting Feast 0.24, data source unique names will be required to encourage data source discovery.", - RuntimeWarning, - ) print( yaml.dump( yaml.safe_load(str(data_source)), default_flow_style=False, sort_keys=False @@ -216,19 +232,15 @@ def data_source_list(ctx: click.Context): List all data sources """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) table = [] for datasource in store.list_data_sources(): table.append([datasource.name, datasource.__class__]) from tabulate import tabulate - warnings.warn( - "Listing data sources will only work properly if all data sources have names or table names specified. 
" - "Starting Feast 0.24, data source unique names will be required to encourage data source discovery", - RuntimeWarning, - ) print(tabulate(table, headers=["NAME", "CLASS"], tablefmt="plain")) @@ -248,8 +260,9 @@ def entity_describe(ctx: click.Context, name: str): Describe an entity """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) try: entity = store.get_entity(name) @@ -271,8 +284,9 @@ def entity_list(ctx: click.Context): List all entities """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) table = [] for entity in store.list_entities(): table.append([entity.name, entity.description, entity.value_type]) @@ -298,8 +312,9 @@ def feature_service_describe(ctx: click.Context, name: str): Describe a feature service """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) try: feature_service = store.get_feature_service(name) @@ -323,8 +338,9 @@ def feature_service_list(ctx: click.Context): List all feature services """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) feature_services = [] for feature_service in store.list_feature_services(): feature_names = [] @@ -355,8 +371,9 @@ def feature_view_describe(ctx: click.Context, name: str): Describe a feature view """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store 
= FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) try: feature_view = store.get_feature_view(name) @@ -378,8 +395,10 @@ def feature_view_list(ctx: click.Context): List all feature views """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) table = [] for feature_view in [ *store.list_feature_views(), @@ -421,8 +440,9 @@ def on_demand_feature_view_describe(ctx: click.Context, name: str): [Experimental] Describe an on demand feature view """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) try: on_demand_feature_view = store.get_on_demand_feature_view(name) @@ -446,8 +466,9 @@ def on_demand_feature_view_list(ctx: click.Context): [Experimental] List all on demand feature views """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) table = [] for on_demand_feature_view in store.list_on_demand_feature_views(): table.append([on_demand_feature_view.name]) @@ -469,8 +490,9 @@ def plan_command(ctx: click.Context, skip_source_validation: bool): Create or update a feature store deployment """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_config = load_repo_config(repo) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + repo_config = load_repo_config(repo, fs_yaml_file) try: plan(repo_config, repo, skip_source_validation) except 
FeastProviderLoginError as e: @@ -489,8 +511,10 @@ def apply_total_command(ctx: click.Context, skip_source_validation: bool): Create or update a feature store deployment """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_config = load_repo_config(repo) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + + repo_config = load_repo_config(repo, fs_yaml_file) try: apply_total(repo_config, repo, skip_source_validation) except FeastProviderLoginError as e: @@ -504,8 +528,9 @@ def teardown_command(ctx: click.Context): Tear down deployed feature store infrastructure """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_config = load_repo_config(repo) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + repo_config = load_repo_config(repo, fs_yaml_file) teardown(repo_config, repo) @@ -517,8 +542,9 @@ def registry_dump_command(ctx: click.Context): Print contents of the metadata registry """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_config = load_repo_config(repo) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + repo_config = load_repo_config(repo, fs_yaml_file) click.echo(registry_dump(repo_config, repo_path=repo)) @@ -545,8 +571,9 @@ def materialize_command( START_TS and END_TS should be in ISO 8601 format, e.g. '2021-07-16T19:20:01' """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) store.materialize( feature_views=None if not views else views, start_date=utils.make_tzaware(parser.parse(start_ts)), @@ -573,8 +600,9 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List END_TS should be in ISO 8601 format, e.g. 
'2021-07-16T19:20:01' """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) store.materialize_incremental( feature_views=None if not views else views, end_date=utils.make_tzaware(datetime.fromisoformat(end_ts)), @@ -590,7 +618,7 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List "--template", "-t", type=click.Choice( - ["local", "gcp", "aws", "snowflake", "spark", "postgres", "hbase"], + ["local", "gcp", "aws", "snowflake", "spark", "postgres", "hbase", "cassandra"], case_sensitive=False, ), help="Specify a template for the created project", @@ -663,8 +691,21 @@ def serve_command( ): """Start a feature server locally on a given port.""" repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + + # If we received a base64 encoded version of feature_store.yaml, use that + config_base64 = os.getenv(FEATURE_STORE_YAML_ENV_NAME) + if config_base64: + print("Received base64 encoded feature_store.yaml") + config_bytes = base64.b64decode(config_base64) + # Create a new unique directory for writing feature_store.yaml + repo_path = Path(tempfile.mkdtemp()) + with open(repo_path / "feature_store.yaml", "wb") as f: + f.write(config_bytes) + store = FeatureStore(repo_path=str(repo_path.resolve())) + else: + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) if go: # Turn on Go feature retrieval. 
@@ -685,119 +726,13 @@ def serve_command( def serve_transformations_command(ctx: click.Context, port: int): """[Experimental] Start a feature consumption server locally on a given port.""" repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) store.serve_transformations(port) -@cli.group(name="alpha") -def alpha_cmd(): - """ - Access alpha features - """ - pass - - -@alpha_cmd.command("list") -@click.pass_context -def list_alpha_features(ctx: click.Context): - """ - Lists all alpha features - """ - repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_path = str(repo) - store = FeatureStore(repo_path=repo_path) - - flags_to_show = flags.FLAG_NAMES.copy() - flags_to_show.remove(flags.FLAG_ALPHA_FEATURES_NAME) - print("Alpha features:") - for flag in flags_to_show: - enabled_string = ( - "enabled" - if flags_helper.feature_flag_enabled(store.config, flag) - else "disabled" - ) - print(f"{flag}: {enabled_string}") - - -@alpha_cmd.command("enable-all") -@click.pass_context -def enable_alpha_features(ctx: click.Context): - """ - Enables all alpha features - """ - repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_path = str(repo) - store = FeatureStore(repo_path=repo_path) - - if store.config.flags is None: - store.config.flags = {} - for flag_name in flags.FLAG_NAMES: - store.config.flags[flag_name] = True - store.config.write_to_path(Path(repo_path)) - - -@alpha_cmd.command("enable") -@click.argument("name", type=click.STRING) -@click.pass_context -def enable_alpha_feature(ctx: click.Context, name: str): - """ - Enables an alpha feature - """ - if name not in flags.FLAG_NAMES: - raise ValueError(f"Flag name, {name}, not valid.") - - repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_path = str(repo) - store = FeatureStore(repo_path=repo_path) - - if store.config.flags is None: - 
store.config.flags = {} - store.config.flags[flags.FLAG_ALPHA_FEATURES_NAME] = True - store.config.flags[name] = True - store.config.write_to_path(Path(repo_path)) - - -@alpha_cmd.command("disable") -@click.argument("name", type=click.STRING) -@click.pass_context -def disable_alpha_feature(ctx: click.Context, name: str): - """ - Disables an alpha feature - """ - if name not in flags.FLAG_NAMES: - raise ValueError(f"Flag name, {name}, not valid.") - - repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_path = str(repo) - store = FeatureStore(repo_path=repo_path) - - if store.config.flags is None or name not in store.config.flags: - return - store.config.flags[name] = False - store.config.write_to_path(Path(repo_path)) - - -@alpha_cmd.command("disable-all") -@click.pass_context -def disable_alpha_features(ctx: click.Context): - """ - Disables all alpha features - """ - repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - repo_path = str(repo) - store = FeatureStore(repo_path=repo_path) - - store.config.flags = None - store.config.write_to_path(Path(repo_path)) - - @cli.command("validate") @click.option( "--feature-service", @@ -831,8 +766,9 @@ def validate( START_TS and END_TS should be in ISO 8601 format, e.g. '2021-07-16T19:20:01' """ repo = ctx.obj["CHDIR"] - cli_check_repo(repo) - store = FeatureStore(repo_path=str(repo)) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) + store = FeatureStore(repo_path=str(repo), fs_yaml_file=fs_yaml_file) feature_service = store.get_feature_service(name=feature_service) reference = store.get_validation_reference(reference) @@ -873,7 +809,8 @@ def repo_upgrade(ctx: click.Context, write: bool): Upgrade a feature repo in place. 
""" repo = ctx.obj["CHDIR"] - cli_check_repo(repo) + fs_yaml_file = ctx.obj["FS_YAML_FILE"] + cli_check_repo(repo, fs_yaml_file) try: RepoUpgrader(repo, write).upgrade() except FeastProviderLoginError as e: diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index a1e44b3186..19a780b32c 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -16,7 +16,7 @@ import warnings from abc import ABC, abstractmethod from datetime import timedelta -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from google.protobuf.duration_pb2 import Duration from google.protobuf.json_format import MessageToJson @@ -31,19 +31,6 @@ from feast.value_type import ValueType -class SourceType(enum.Enum): - """ - DataSource value type. Used to define source types in DataSource. - """ - - UNKNOWN = 0 - BATCH_FILE = 1 - BATCH_BIGQUERY = 2 - STREAM_KAFKA = 3 - STREAM_KINESIS = 4 - BATCH_TRINO = 5 - - class KafkaOptions: """ DataSource Kafka options used to source features from Kafka messages @@ -169,6 +156,7 @@ def to_proto(self) -> DataSourceProto.KinesisOptions: DataSourceProto.SourceType.BATCH_SNOWFLAKE: "feast.infra.offline_stores.snowflake_source.SnowflakeSource", DataSourceProto.SourceType.BATCH_TRINO: "feast.infra.offline_stores.contrib.trino_offline_store.trino_source.TrinoSource", DataSourceProto.SourceType.BATCH_SPARK: "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource", + DataSourceProto.SourceType.BATCH_ATHENA: "feast.infra.offline_stores.contrib.athena_offline_store.athena_source.AthenaSource", DataSourceProto.SourceType.STREAM_KAFKA: "feast.data_source.KafkaSource", DataSourceProto.SourceType.STREAM_KINESIS: "feast.data_source.KinesisSource", DataSourceProto.SourceType.REQUEST_SOURCE: "feast.data_source.RequestSource", @@ -183,96 +171,67 @@ class DataSource(ABC): Args: name: Name of data source, 
which should be unique within a project - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. + timestamp_field (optional): Event timestamp field used for point-in-time joins of + feature values. created_timestamp_column (optional): Timestamp column indicating when the row was created, used for deduplicating rows. field_mapping (optional): A dictionary mapping of column names in this data source to feature names in a feature table or view. Only used for feature columns, not entity or timestamp columns. - date_partition_column (optional): Timestamp column used for partitioning. description (optional) A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the data source, typically the email of the primary maintainer. timestamp_field (optional): Event timestamp field used for point in time joins of feature values. + date_partition_column (optional): Timestamp column used for partitioning. Not supported by all offline stores. """ name: str timestamp_field: str created_timestamp_column: str field_mapping: Dict[str, str] - date_partition_column: str description: str tags: Dict[str, str] owner: str + date_partition_column: str def __init__( self, *, - event_timestamp_column: Optional[str] = None, + name: str, + timestamp_field: Optional[str] = None, created_timestamp_column: Optional[str] = None, field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", - name: Optional[str] = None, - timestamp_field: Optional[str] = None, + date_partition_column: Optional[str] = None, ): """ Creates a DataSource object. 
+ Args: - name: Name of data source, which should be unique within a project - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. + name: Name of data source, which should be unique within a project. + timestamp_field (optional): Event timestamp field used for point-in-time joins of + feature values. created_timestamp_column (optional): Timestamp column indicating when the row was created, used for deduplicating rows. field_mapping (optional): A dictionary mapping of column names in this data source to feature names in a feature table or view. Only used for feature columns, not entity or timestamp columns. - date_partition_column (optional): Timestamp column used for partitioning. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the data source, typically the email of the primary maintainer. - timestamp_field (optional): Event timestamp field used for point - in time joins of feature values. + date_partition_column (optional): Timestamp column used for partitioning. Not supported by all stores """ - if not name: - warnings.warn( - ( - "Names for data sources need to be supplied. " - "Data sources without names will not be supported after Feast 0.24." - ), - UserWarning, - ) - self.name = name or "" - if not timestamp_field and event_timestamp_column: - warnings.warn( - ( - "The argument 'event_timestamp_column' is being deprecated. Please use 'timestamp_field' instead. " - "instead. Feast 0.24 and onwards will not support the argument 'event_timestamp_column' for datasources." 
- ), - DeprecationWarning, - ) - self.timestamp_field = timestamp_field or event_timestamp_column or "" + self.name = name + self.timestamp_field = timestamp_field or "" self.created_timestamp_column = ( created_timestamp_column if created_timestamp_column else "" ) self.field_mapping = field_mapping if field_mapping else {} - self.date_partition_column = ( - date_partition_column if date_partition_column else "" - ) - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is being deprecated. " - "Feast 0.25 and onwards will not support 'date_timestamp_column' for data sources." - ), - DeprecationWarning, - ) if ( self.timestamp_field and self.timestamp_field == self.created_timestamp_column @@ -283,6 +242,9 @@ def __init__( self.description = description or "" self.tags = tags or {} self.owner = owner or "" + self.date_partition_column = ( + date_partition_column if date_partition_column else "" + ) def __hash__(self): return hash((self.name, self.timestamp_field)) @@ -337,7 +299,6 @@ def from_proto(data_source: DataSourceProto) -> Any: if data_source_type == DataSourceProto.SourceType.CUSTOM_SOURCE: cls = get_data_source_class_from_type(data_source.data_source_class_type) return cls.from_proto(data_source) - cls = get_data_source_class_from_type(_DATA_SOURCE_OPTIONS[data_source_type]) return cls.from_proto(data_source) @@ -387,20 +348,18 @@ def get_table_query_string(self) -> str: class KafkaSource(DataSource): def __init__( self, - *args, - name: Optional[str] = None, - event_timestamp_column: Optional[str] = "", + *, + name: str, + timestamp_field: str, + message_format: StreamFormat, bootstrap_servers: Optional[str] = None, kafka_bootstrap_servers: Optional[str] = None, - message_format: Optional[StreamFormat] = None, topic: Optional[str] = None, created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = "", description: Optional[str] = "", tags: 
Optional[Dict[str, str]] = None, owner: Optional[str] = "", - timestamp_field: Optional[str] = "", batch_source: Optional[DataSource] = None, watermark_delay_threshold: Optional[timedelta] = None, ): @@ -409,41 +368,24 @@ def __init__( Args: name: Name of data source, which should be unique within a project - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. - bootstrap_servers: (Deprecated) The servers of the kafka broker in the form "localhost:9092". - kafka_bootstrap_servers: The servers of the kafka broker in the form "localhost:9092". + timestamp_field: Event timestamp field used for point-in-time joins of feature values. message_format: StreamFormat of serialized messages. - topic: The name of the topic to read from in the kafka source. + bootstrap_servers: (Deprecated) The servers of the kafka broker in the form "localhost:9092". + kafka_bootstrap_servers (optional): The servers of the kafka broker in the form "localhost:9092". + topic (optional): The name of the topic to read from in the kafka source. created_timestamp_column (optional): Timestamp column indicating when the row was created, used for deduplicating rows. field_mapping (optional): A dictionary mapping of column names in this data source to feature names in a feature table or view. Only used for feature columns, not entity or timestamp columns. - date_partition_column (optional): Timestamp column used for partitioning. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the data source, typically the email of the primary maintainer. - timestamp_field (optional): Event timestamp field used for point - in time joins of feature values. - batch_source: The datasource that acts as a batch source. - watermark_delay_threshold: The watermark delay threshold for stream data. 
Specifically how - late stream data can arrive without being discarded. + batch_source (optional): The datasource that acts as a batch source. + watermark_delay_threshold (optional): The watermark delay threshold for stream data. + Specifically how late stream data can arrive without being discarded. """ - positional_attributes = [ - "name", - "event_timestamp_column", - "bootstrap_servers", - "message_format", - "topic", - ] - _name = name - _event_timestamp_column = event_timestamp_column - _kafka_bootstrap_servers = kafka_bootstrap_servers or bootstrap_servers or "" - _message_format = message_format - _topic = topic or "" - if bootstrap_servers: warnings.warn( ( @@ -453,53 +395,24 @@ def __init__( DeprecationWarning, ) - if args: - warnings.warn( - ( - "Kafka parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct Kafka sources" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"Kafka sources, for backwards compatibility." 
- ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _event_timestamp_column = args[1] - if len(args) >= 3: - _kafka_bootstrap_servers = args[2] - if len(args) >= 4: - _message_format = args[3] - if len(args) >= 5: - _topic = args[4] - - if _message_format is None: - raise ValueError("Message format must be specified for Kafka source") - - if not timestamp_field and not _event_timestamp_column: - raise ValueError("Timestamp field must be specified for Kafka source") - super().__init__( - event_timestamp_column=_event_timestamp_column, + name=name, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, - date_partition_column=date_partition_column, description=description, tags=tags, owner=owner, - name=_name, - timestamp_field=timestamp_field, ) self.batch_source = batch_source + kafka_bootstrap_servers = kafka_bootstrap_servers or bootstrap_servers or "" + topic = topic or "" + self.kafka_options = KafkaOptions( - kafka_bootstrap_servers=_kafka_bootstrap_servers, - message_format=_message_format, - topic=_topic, + kafka_bootstrap_servers=kafka_bootstrap_servers, + message_format=message_format, + topic=topic, watermark_delay_threshold=watermark_delay_threshold, ) @@ -539,7 +452,6 @@ def from_proto(data_source: DataSourceProto): ) return KafkaSource( name=data_source.name, - event_timestamp_column=data_source.timestamp_field, field_mapping=dict(data_source.field_mapping), kafka_bootstrap_servers=data_source.kafka_options.kafka_bootstrap_servers, message_format=StreamFormat.from_proto( @@ -549,7 +461,6 @@ def from_proto(data_source: DataSourceProto): topic=data_source.kafka_options.topic, created_timestamp_column=data_source.created_timestamp_column, timestamp_field=data_source.timestamp_field, - date_partition_column=data_source.date_partition_column, description=data_source.description, tags=dict(data_source.tags), owner=data_source.owner, @@ -571,7 +482,6 @@ def to_proto(self) -> 
DataSourceProto: data_source_proto.timestamp_field = self.timestamp_field data_source_proto.created_timestamp_column = self.created_timestamp_column - data_source_proto.date_partition_column = self.date_partition_column if self.batch_source: data_source_proto.batch_source.MergeFrom(self.batch_source.to_proto()) return data_source_proto @@ -614,55 +524,16 @@ class RequestSource(DataSource): def __init__( self, - *args, - name: Optional[str] = None, - schema: Optional[Union[Dict[str, ValueType], List[Field]]] = None, + *, + name: str, + schema: List[Field], description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", ): """Creates a RequestSource object.""" - positional_attributes = ["name", "schema"] - _name = name - _schema = schema - if args: - warnings.warn( - ( - "Request source parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct request sources" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"feature views, for backwards compatibility." - ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _schema = args[1] - - super().__init__(name=_name, description=description, tags=tags, owner=owner) - if not _schema: - raise ValueError("Schema needs to be provided for Request Source") - if isinstance(_schema, Dict): - warnings.warn( - "Schema in RequestSource is changing type. The schema data type Dict[str, ValueType] is being deprecated in Feast 0.24. 
" - "Please use List[Field] instead for the schema", - DeprecationWarning, - ) - schema_list = [] - for key, value_type in _schema.items(): - schema_list.append(Field(name=key, dtype=from_value_type(value_type))) - self.schema = schema_list - elif isinstance(_schema, List): - self.schema = _schema - else: - raise Exception( - "Schema type must be either dictionary or list, not " - + str(type(_schema)) - ) + super().__init__(name=name, description=description, tags=tags, owner=owner) + self.schema = schema def validate(self, config: RepoConfig): pass @@ -694,38 +565,18 @@ def __hash__(self): @staticmethod def from_proto(data_source: DataSourceProto): - - deprecated_schema = data_source.request_data_options.deprecated_schema schema_pb = data_source.request_data_options.schema + list_schema = [] + for field_proto in schema_pb: + list_schema.append(Field.from_proto(field_proto)) - if deprecated_schema and not schema_pb: - warnings.warn( - "Schema in RequestSource is changing type. The schema data type Dict[str, ValueType] is being deprecated in Feast 0.24. 
" - "Please use List[Field] instead for the schema", - DeprecationWarning, - ) - dict_schema = {} - for key, val in deprecated_schema.items(): - dict_schema[key] = ValueType(val) - return RequestSource( - name=data_source.name, - schema=dict_schema, - description=data_source.description, - tags=dict(data_source.tags), - owner=data_source.owner, - ) - else: - list_schema = [] - for field_proto in schema_pb: - list_schema.append(Field.from_proto(field_proto)) - - return RequestSource( - name=data_source.name, - schema=list_schema, - description=data_source.description, - tags=dict(data_source.tags), - owner=data_source.owner, - ) + return RequestSource( + name=data_source.name, + schema=list_schema, + description=data_source.description, + tags=dict(data_source.tags), + owner=data_source.owner, + ) def to_proto(self) -> DataSourceProto: @@ -758,16 +609,6 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: raise NotImplementedError -@typechecked -class RequestDataSource(RequestSource): - def __init__(self, *args, **kwargs): - warnings.warn( - "The 'RequestDataSource' class is deprecated and was renamed to RequestSource. Please use RequestSource instead. 
This class name will be removed in Feast 0.24.", - DeprecationWarning, - ) - super().__init__(*args, **kwargs) - - @typechecked class KinesisSource(DataSource): def validate(self, config: RepoConfig): @@ -782,7 +623,7 @@ def get_table_column_names_and_types( def from_proto(data_source: DataSourceProto): return KinesisSource( name=data_source.name, - event_timestamp_column=data_source.timestamp_field, + timestamp_field=data_source.timestamp_field, field_mapping=dict(data_source.field_mapping), record_format=StreamFormat.from_proto( data_source.kinesis_options.record_format @@ -790,8 +631,6 @@ def from_proto(data_source: DataSourceProto): region=data_source.kinesis_options.region, stream_name=data_source.kinesis_options.stream_name, created_timestamp_column=data_source.created_timestamp_column, - timestamp_field=data_source.timestamp_field, - date_partition_column=data_source.date_partition_column, description=data_source.description, tags=dict(data_source.tags), owner=data_source.owner, @@ -809,78 +648,34 @@ def get_table_query_string(self) -> str: def __init__( self, - *args, - name: Optional[str] = None, - event_timestamp_column: Optional[str] = "", + *, + name: str, + record_format: StreamFormat, + region: str, + stream_name: str, + timestamp_field: Optional[str] = "", created_timestamp_column: Optional[str] = "", - record_format: Optional[StreamFormat] = None, - region: Optional[str] = "", - stream_name: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = "", description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", - timestamp_field: Optional[str] = "", batch_source: Optional[DataSource] = None, ): - positional_attributes = [ - "name", - "event_timestamp_column", - "created_timestamp_column", - "record_format", - "region", - "stream_name", - ] - _name = name - _event_timestamp_column = event_timestamp_column - _created_timestamp_column = created_timestamp_column - 
_record_format = record_format - _region = region or "" - _stream_name = stream_name or "" - if args: - warnings.warn( - ( - "Kinesis parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct kinesis sources" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"kinesis sources, for backwards compatibility." - ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _event_timestamp_column = args[1] - if len(args) >= 3: - _created_timestamp_column = args[2] - if len(args) >= 4: - _record_format = args[3] - if len(args) >= 5: - _region = args[4] - if len(args) >= 6: - _stream_name = args[5] - - if _record_format is None: + if record_format is None: raise ValueError("Record format must be specified for kinesis source") super().__init__( - name=_name, - event_timestamp_column=_event_timestamp_column, - created_timestamp_column=_created_timestamp_column, + name=name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, - date_partition_column=date_partition_column, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) self.batch_source = batch_source self.kinesis_options = KinesisOptions( - record_format=_record_format, region=_region, stream_name=_stream_name + record_format=record_format, region=region, stream_name=stream_name ) def __eq__(self, other): @@ -917,7 +712,6 @@ def to_proto(self) -> DataSourceProto: data_source_proto.timestamp_field = self.timestamp_field data_source_proto.created_timestamp_column = self.created_timestamp_column - data_source_proto.date_partition_column = self.date_partition_column if self.batch_source: data_source_proto.batch_source.MergeFrom(self.batch_source.to_proto()) @@ -942,15 +736,16 @@ class 
PushSource(DataSource): def __init__( self, - *args, - name: Optional[str] = None, - batch_source: Optional[DataSource] = None, + *, + name: str, + batch_source: DataSource, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", ): """ Creates a PushSource object. + Args: name: Name of the push source batch_source: The batch source that backs this push source. It's used when materializing from the offline @@ -959,35 +754,9 @@ def __init__( tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the data source, typically the email of the primary maintainer. - """ - positional_attributes = ["name", "batch_source"] - _name = name - _batch_source = batch_source - if args: - warnings.warn( - ( - "Push source parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct push sources" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"push sources, for backwards compatibility." 
- ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _batch_source = args[1] - - super().__init__(name=_name, description=description, tags=tags, owner=owner) - if not _batch_source: - raise ValueError( - f"batch_source parameter is needed for push source {self.name}" - ) - self.batch_source = _batch_source + super().__init__(name=name, description=description, tags=tags, owner=owner) + self.batch_source = batch_source def __eq__(self, other): if not isinstance(other, PushSource): diff --git a/sdk/python/feast/diff/registry_diff.py b/sdk/python/feast/diff/registry_diff.py index fc0acf0223..15f880e392 100644 --- a/sdk/python/feast/diff/registry_diff.py +++ b/sdk/python/feast/diff/registry_diff.py @@ -8,6 +8,8 @@ from feast.feast_object import FeastObject, FeastObjectSpecProto from feast.feature_service import FeatureService from feast.feature_view import DUMMY_ENTITY_NAME +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.registry.registry import FEAST_OBJECT_TYPES, FeastObjectType from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.Entity_pb2 import Entity as EntityProto from feast.protos.feast.core.FeatureService_pb2 import ( @@ -17,6 +19,7 @@ from feast.protos.feast.core.OnDemandFeatureView_pb2 import ( OnDemandFeatureView as OnDemandFeatureViewProto, ) +from feast.protos.feast.core.OnDemandFeatureView_pb2 import OnDemandFeatureViewSpec from feast.protos.feast.core.RequestFeatureView_pb2 import ( RequestFeatureView as RequestFeatureViewProto, ) @@ -26,7 +29,6 @@ from feast.protos.feast.core.ValidationProfile_pb2 import ( ValidationReference as ValidationReferenceProto, ) -from feast.registry import FEAST_OBJECT_TYPES, BaseRegistry, FeastObjectType from feast.repo_contents import RepoContents @@ -137,19 +139,39 @@ def diff_registry_objects( else: current_spec = current_proto.spec new_spec = new_proto.spec - if current_spec != new_spec: + if current != new: for _field 
in current_spec.DESCRIPTOR.fields: if _field.name in FIELDS_TO_IGNORE: continue - if getattr(current_spec, _field.name) != getattr(new_spec, _field.name): - transition = TransitionType.UPDATE - property_diffs.append( - PropertyDiff( - _field.name, - getattr(current_spec, _field.name), - getattr(new_spec, _field.name), + elif getattr(current_spec, _field.name) != getattr(new_spec, _field.name): + if _field.name == "user_defined_function": + current_spec = cast(OnDemandFeatureViewSpec, current_spec) + new_spec = cast(OnDemandFeatureViewSpec, new_spec) + current_udf = current_spec.user_defined_function + new_udf = new_spec.user_defined_function + for _udf_field in current_udf.DESCRIPTOR.fields: + if _udf_field.name == "body": + continue + if getattr(current_udf, _udf_field.name) != getattr( + new_udf, _udf_field.name + ): + transition = TransitionType.UPDATE + property_diffs.append( + PropertyDiff( + _field.name + "." + _udf_field.name, + getattr(current_udf, _udf_field.name), + getattr(new_udf, _udf_field.name), + ) + ) + else: + transition = TransitionType.UPDATE + property_diffs.append( + PropertyDiff( + _field.name, + getattr(current_spec, _field.name), + getattr(new_spec, _field.name), + ) ) - ) return FeastObjectDiff( name=new_spec.name, feast_object_type=object_type, diff --git a/sdk/python/feast/driver_test_data.py b/sdk/python/feast/driver_test_data.py index da9d061313..58c3e8db8f 100644 --- a/sdk/python/feast/driver_test_data.py +++ b/sdk/python/feast/driver_test_data.py @@ -164,7 +164,7 @@ def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.Data "event_timestamp": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range( - start=start_date, end=end_date, freq="1D", closed="left" + start=start_date, end=end_date, freq="1D", inclusive="left" ) ] } @@ -209,7 +209,7 @@ def create_location_stats_df(locations, start_date, end_date) -> pd.DataFrame: "event_timestamp": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") 
for dt in pd.date_range( - start=start_date, end=end_date, freq="1H", closed="left" + start=start_date, end=end_date, freq="1H", inclusive="left" ) ] } @@ -256,7 +256,7 @@ def create_global_daily_stats_df(start_date, end_date) -> pd.DataFrame: "event_timestamp": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range( - start=start_date, end=end_date, freq="1D", closed="left" + start=start_date, end=end_date, freq="1D", inclusive="left" ) ] } diff --git a/sdk/python/feast/entity.py b/sdk/python/feast/entity.py index 0c468c7b59..55de68a404 100644 --- a/sdk/python/feast/entity.py +++ b/sdk/python/feast/entity.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import warnings from datetime import datetime from typing import Dict, List, Optional @@ -33,7 +32,10 @@ class Entity: Attributes: name: The unique name of the entity. - value_type (deprecated): The type of the entity, such as string or float. + value_type: The type of the entity, such as string or float. + join_keys: A list of properties that uniquely identifies different entities within the + collection. This currently only supports a list of size one, but is intended to + eventually support multiple join keys. join_key: A property that uniquely identifies different entities within the collection. The join_key property is typically used for joining entities with their associated features. If not specified, defaults to the name. @@ -42,108 +44,62 @@ class Entity: owner: The owner of the entity, typically the email of the primary maintainer. created_timestamp: The time when the entity was created. last_updated_timestamp: The time when the entity was last updated. - join_keys: A list of properties that uniquely identifies different entities within the - collection. 
This is meant to replace the `join_key` parameter, but currently only - supports a list of size one. """ name: str value_type: ValueType + join_keys: List[str] join_key: str description: str tags: Dict[str, str] owner: str created_timestamp: Optional[datetime] last_updated_timestamp: Optional[datetime] - join_keys: List[str] @log_exceptions def __init__( self, - *args, - name: Optional[str] = None, - value_type: Optional[ValueType] = None, + *, + name: str, + join_keys: Optional[List[str]] = None, description: str = "", - join_key: Optional[str] = None, tags: Optional[Dict[str, str]] = None, owner: str = "", - join_keys: Optional[List[str]] = None, ): """ Creates an Entity object. Args: name: The unique name of the entity. - value_type (deprecated): The type of the entity, such as string or float. - description: A human-readable description. - join_key (deprecated): A property that uniquely identifies different entities within the - collection. The join_key property is typically used for joining entities - with their associated features. If not specified, defaults to the name. - tags: A dictionary of key-value pairs to store arbitrary metadata. - owner: The owner of the entity, typically the email of the primary maintainer. - join_keys: A list of properties that uniquely identifies different entities within the - collection. This is meant to replace the `join_key` parameter, but currently only - supports a list of size one. + join_keys (optional): A list of properties that uniquely identifies different entities + within the collection. This currently only supports a list of size one, but is + intended to eventually support multiple join keys. + description (optional): A human-readable description. + tags (optional): A dictionary of key-value pairs to store arbitrary metadata. + owner (optional): The owner of the entity, typically the email of the primary maintainer. Raises: ValueError: Parameters are specified incorrectly. 
""" - if len(args) == 1: - warnings.warn( - ( - "Entity name should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct Entities" - ), - DeprecationWarning, - ) - if len(args) > 1: - raise ValueError( - "All arguments to construct an entity should be specified as keyword arguments only" - ) - - self.name = args[0] if len(args) > 0 else name - - if not self.name: - raise ValueError("Name needs to be specified") - - if value_type: - warnings.warn( - ( - "The `value_type` parameter is being deprecated. Instead, the type of an entity " - "should be specified as a Field in the schema of a feature view. Feast 0.24 and " - "onwards will not support the `value_type` parameter. The `entities` parameter of " - "feature views should also be changed to a List[Entity] instead of a List[str]; if " - "this is not done, entity columns will be mistakenly interpreted as feature columns." - ), - DeprecationWarning, - ) - self.value_type = value_type or ValueType.UNKNOWN + self.name = name + self.value_type = ValueType.UNKNOWN # For now, both the `join_key` and `join_keys` attributes are set correctly, # so both are usable. - # TODO(felixwang9817): Remove the usage of `join_key` throughout the codebase - # when the usage of `join_key` as a parameter is removed. - if join_key: - warnings.warn( - ( - "The `join_key` parameter is being deprecated in favor of the `join_keys` parameter. " - "Please switch from using `join_key` to `join_keys`. Feast 0.24 and onwards will not " - "support the `join_key` parameter." - ), - DeprecationWarning, - ) - self.join_keys = join_keys or [] + # TODO(felixwang9817): Fully remove the usage of `join_key` throughout the codebase, + # at which point the `join_key` attribute no longer needs to be set. if join_keys and len(join_keys) > 1: raise ValueError( "An entity may only have single join key. " "Multiple join keys will be supported in the future." 
) - if join_keys and len(join_keys) == 1: + elif join_keys and len(join_keys) == 1: + self.join_keys = join_keys self.join_key = join_keys[0] else: - self.join_key = join_key if join_key else self.name - if not self.join_keys: + self.join_key = self.name self.join_keys = [self.join_key] + self.description = description self.tags = tags if tags is not None else {} self.owner = owner diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index 980dfd470f..15fda6ac7d 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -19,6 +19,13 @@ def __init__(self): ) +class DataSourceRepeatNamesException(Exception): + def __init__(self, ds_name: str): + super().__init__( + f"Multiple data sources share the same case-insensitive name {ds_name}." + ) + + class FeastObjectNotFoundException(Exception): pass @@ -197,6 +204,18 @@ def __init__( ) +class SavedDatasetLocationAlreadyExists(Exception): + def __init__(self, location: str): + super().__init__(f"Saved dataset location {location} already exists.") + + +class FeastOfflineStoreInvalidName(Exception): + def __init__(self, offline_store_class_name: str): + super().__init__( + f"Offline Store Class '{offline_store_class_name}' should end with the string `OfflineStore`.'" + ) + + class FeastOnlineStoreInvalidName(Exception): def __init__(self, online_store_class_name: str): super().__init__( @@ -296,6 +315,13 @@ def __init__(self, expected_column_name: str): ) +class FeatureViewMissingDuringFeatureServiceInference(Exception): + def __init__(self, feature_view_name: str, feature_service_name: str): + super().__init__( + f"Missing {feature_view_name} feature view during inference for {feature_service_name} feature service." 
+ ) + + class InvalidEntityType(Exception): def __init__(self, entity_type: type): super().__init__( @@ -312,14 +338,6 @@ def __init__(self, feature_view_name: str): ) -class ExperimentalFeatureNotEnabled(Exception): - def __init__(self, feature_flag_name: str): - super().__init__( - f"You are attempting to use an experimental feature that is not enabled. Please run " - f"`feast alpha enable {feature_flag_name}` " - ) - - class RepoConfigPathDoesNotExist(Exception): def __init__(self): super().__init__("The repo_path attribute does not exist for the repo_config.") diff --git a/sdk/python/feast/feast_object.py b/sdk/python/feast/feast_object.py index 0ac0446f5f..38109f5d8c 100644 --- a/sdk/python/feast/feast_object.py +++ b/sdk/python/feast/feast_object.py @@ -1,5 +1,6 @@ from typing import Union +from .batch_feature_view import BatchFeatureView from .data_source import DataSource from .entity import Entity from .feature_service import FeatureService @@ -16,12 +17,15 @@ ) from .request_feature_view import RequestFeatureView from .saved_dataset import ValidationReference +from .stream_feature_view import StreamFeatureView # Convenience type representing all Feast objects FeastObject = Union[ FeatureView, OnDemandFeatureView, RequestFeatureView, + BatchFeatureView, + StreamFeatureView, Entity, FeatureService, DataSource, diff --git a/sdk/python/feast/feature_logging.py b/sdk/python/feast/feature_logging.py index da9a0c9fe5..bd45c09b0a 100644 --- a/sdk/python/feast/feature_logging.py +++ b/sdk/python/feast/feature_logging.py @@ -18,7 +18,7 @@ if TYPE_CHECKING: from feast.feature_service import FeatureService - from feast.registry import BaseRegistry + from feast.infra.registry.base_registry import BaseRegistry REQUEST_ID_FIELD = "__request_id" diff --git a/sdk/python/feast/feature_service.py b/sdk/python/feast/feature_service.py index 410ba94b00..c3037a55da 100644 --- a/sdk/python/feast/feature_service.py +++ b/sdk/python/feast/feature_service.py @@ -1,4 +1,3 @@ 
-import warnings from datetime import datetime from typing import Dict, List, Optional, Union @@ -6,6 +5,7 @@ from typeguard import typechecked from feast.base_feature_view import BaseFeatureView +from feast.errors import FeatureViewMissingDuringFeatureServiceInference from feast.feature_logging import LoggingConfig from feast.feature_view import FeatureView from feast.feature_view_projection import FeatureViewProjection @@ -53,9 +53,9 @@ class FeatureService: @log_exceptions def __init__( self, - *args, - name: Optional[str] = None, - features: Optional[List[Union[FeatureView, OnDemandFeatureView]]] = None, + *, + name: str, + features: List[Union[FeatureView, OnDemandFeatureView]], tags: Dict[str, str] = None, description: str = "", owner: str = "", @@ -64,39 +64,17 @@ def __init__( """ Creates a FeatureService object. - Raises: - ValueError: If one of the specified features is not a valid type. + Args: + name: The unique name of the feature service. + feature_view_projections: A list containing feature views and feature view + projections, representing the features in the feature service. + description (optional): A human-readable description. + tags (optional): A dictionary of key-value pairs to store arbitrary metadata. + owner (optional): The owner of the feature view, typically the email of the + primary maintainer. """ - positional_attributes = ["name", "features"] - _name = name - _features = features - if args: - warnings.warn( - ( - "Feature service parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct feature service" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"feature service, for backwards compatibility." 
- ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _features = args[1] - - if not _name: - raise ValueError("Feature service name needs to be specified") - - if not _features: - # Technically, legal to create feature service with no feature views before. - _features = [] - - self.name = _name - self._features = _features + self.name = name + self._features = features self.feature_view_projections = [] self.description = description self.tags = tags or {} @@ -108,15 +86,29 @@ def __init__( if isinstance(feature_grouping, BaseFeatureView): self.feature_view_projections.append(feature_grouping.projection) - def infer_features(self, fvs_to_update: Optional[Dict[str, FeatureView]] = None): + def infer_features(self, fvs_to_update: Dict[str, FeatureView]): + """ + Infers the features for the projections of this feature service, and updates this feature + service in place. + + This method is necessary since feature services may rely on feature views which require + feature inference. + + Args: + fvs_to_update: A mapping of feature view names to corresponding feature views that + contains all the feature views necessary to run inference. + """ for feature_grouping in self._features: if isinstance(feature_grouping, BaseFeatureView): - # For feature services that depend on an unspecified feature view, apply inferred schema - if fvs_to_update and feature_grouping.name in fvs_to_update: - if feature_grouping.projection.desired_features: - desired_features = set( - feature_grouping.projection.desired_features - ) + projection = feature_grouping.projection + + if projection.desired_features: + # The projection wants to select a specific set of inferred features. + # Example: FeatureService(features=[fv[["inferred_feature"]]]), where + # 'fv' is a feature view that was defined without a schema. + if feature_grouping.name in fvs_to_update: + # First we validate that the selected features have actually been inferred. 
+ desired_features = set(projection.desired_features) actual_features = set( [ f.name @@ -124,16 +116,38 @@ def infer_features(self, fvs_to_update: Optional[Dict[str, FeatureView]] = None) ] ) assert desired_features.issubset(actual_features) - # We need to set the features for the projection at this point so we ensure we're starting with - # an empty list. - feature_grouping.projection.features = [] + + # Then we extract the selected features and add them to the projection. + projection.features = [] for f in fvs_to_update[feature_grouping.name].features: if f.name in desired_features: - feature_grouping.projection.features.append(f) + projection.features.append(f) else: - feature_grouping.projection.features = fvs_to_update[ - feature_grouping.name - ].features + raise FeatureViewMissingDuringFeatureServiceInference( + feature_view_name=feature_grouping.name, + feature_service_name=self.name, + ) + + continue + + if projection.features: + # The projection has already selected features from a feature view with a + # known schema, so no action needs to be taken. + # Example: FeatureService(features=[fv[["existing_feature"]]]), where + # 'existing_feature' was defined as part of the schema of 'fv'. + # Example: FeatureService(features=[fv]), where 'fv' was defined with a schema. + continue + + # The projection wants to select all possible inferred features. + # Example: FeatureService(features=[fv]), where 'fv' is a feature view that + # was defined without a schema. 
+ if feature_grouping.name in fvs_to_update: + projection.features = fvs_to_update[feature_grouping.name].features + else: + raise FeatureViewMissingDuringFeatureServiceInference( + feature_view_name=feature_grouping.name, + feature_service_name=self.name, + ) else: raise ValueError( f"The feature service {self.name} has been provided with an invalid type " diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index c4ccc9a648..23600e7c64 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -40,17 +40,23 @@ from google.protobuf.timestamp_pb2 import Timestamp from tqdm import tqdm -from feast import feature_server, flags, flags_helper, ui_server, utils +from feast import feature_server, flags_helper, ui_server, utils from feast.base_feature_view import BaseFeatureView from feast.batch_feature_view import BatchFeatureView -from feast.data_source import DataSource, PushMode +from feast.data_source import ( + DataSource, + KafkaSource, + KinesisSource, + PushMode, + PushSource, +) from feast.diff.infra_diff import InfraDiff, diff_infra_protos from feast.diff.registry_diff import RegistryDiff, apply_diff_to_registry, diff_between from feast.dqm.errors import ValidationFailed from feast.entity import Entity from feast.errors import ( + DataSourceRepeatNamesException, EntityNotFoundException, - ExperimentalFeatureNotEnabled, FeatureNameCollisionError, FeatureViewNotFoundException, RequestDataNotFoundInEntityDfException, @@ -71,7 +77,9 @@ ) from feast.infra.infra_object import Infra from feast.infra.provider import Provider, RetrievalJob, get_provider -from feast.infra.registry_stores.sql import SqlRegistry +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.registry.registry import Registry +from feast.infra.registry.sql import SqlRegistry from feast.on_demand_feature_view import OnDemandFeatureView from feast.online_response import OnlineResponse from 
feast.protos.feast.serving.ServingService_pb2 import ( @@ -80,7 +88,6 @@ ) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import RepeatedValue, Value -from feast.registry import BaseRegistry, Registry from feast.repo_config import RepoConfig, load_repo_config from feast.repo_contents import RepoContents from feast.request_feature_view import RequestFeatureView @@ -104,40 +111,57 @@ class FeatureStore: """ A FeatureStore object is used to define, create, and retrieve features. - Args: - repo_path (optional): Path to a `feature_store.yaml` used to configure the - feature store. - config (optional): Configuration object used to configure the feature store. + Attributes: + config: The config for the feature store. + repo_path: The path to the feature repo. + _registry: The registry for the feature store. + _provider: The provider for the feature store. + _go_server: The (optional) Go feature server for the feature store. """ config: RepoConfig repo_path: Path _registry: BaseRegistry _provider: Provider - _go_server: "EmbeddedOnlineFeatureServer" + _go_server: Optional["EmbeddedOnlineFeatureServer"] @log_exceptions def __init__( self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None, + fs_yaml_file: Optional[Path] = None, ): """ Creates a FeatureStore object. + Args: + repo_path (optional): Path to the feature repo. Defaults to the current working directory. + config (optional): Configuration object used to configure the feature store. + fs_yaml_file (optional): Path to the `feature_store.yaml` file used to configure the feature store. + At most one of 'fs_yaml_file' and 'config' can be set. + Raises: ValueError: If both or neither of repo_path and config are specified. 
""" - if repo_path is not None and config is not None: - raise ValueError("You cannot specify both repo_path and config.") - if config is not None: + if fs_yaml_file is not None and config is not None: + raise ValueError("You cannot specify both fs_yaml_file and config.") + + if repo_path: + self.repo_path = Path(repo_path) + else: self.repo_path = Path(os.getcwd()) + + # If config is specified, or fs_yaml_file is specified, those take precedence over + # the default feature_store.yaml location under repo_path. + if config is not None: self.config = config - elif repo_path is not None: - self.repo_path = Path(repo_path) - self.config = load_repo_config(Path(repo_path)) + elif fs_yaml_file is not None: + self.config = load_repo_config(self.repo_path, fs_yaml_file) else: - raise ValueError("Please specify one of repo_path or config.") + self.config = load_repo_config( + self.repo_path, Path(self.repo_path) / "feature_store.yaml" + ) registry_config = self.config.get_registry_config() if registry_config.registry_type == "sql": @@ -146,7 +170,8 @@ def __init__( r = Registry(registry_config, repo_path=self.repo_path) r._initialize_registry(self.config.project) self._registry = r - self._provider = get_provider(self.config, self.repo_path) + + self._provider = get_provider(self.config) self._go_server = None @log_exceptions @@ -502,8 +527,8 @@ def _get_features( ) if feature_service_from_registry != _features: warnings.warn( - "The FeatureService object that has been passed in as an argument is" - "inconsistent with the version from Registry. Potentially a newer version" + "The FeatureService object that has been passed in as an argument is " + "inconsistent with the version from the registry. Potentially a newer version " "of the FeatureService has been applied to the registry." 
) for projection in feature_service_from_registry.feature_view_projections: @@ -533,14 +558,13 @@ def _validate_all_feature_views( sfvs_to_update: List[StreamFeatureView], ): """Validates all feature views.""" - if ( - not flags_helper.enable_on_demand_feature_views(self.config) - and len(odfvs_to_update) > 0 - ): - raise ExperimentalFeatureNotEnabled(flags.FLAG_ON_DEMAND_TRANSFORM_NAME) - + if len(odfvs_to_update) > 0 and not flags_helper.is_test(): + warnings.warn( + "On demand feature view is an experimental feature. " + "This API is stable, but the functionality does not scale well for offline retrieval", + RuntimeWarning, + ) set_usage_attribute("odfv", bool(odfvs_to_update)) - _validate_feature_views( [ *views_to_update, @@ -666,10 +690,10 @@ def plan( >>> from feast import FeatureStore, Entity, FeatureView, Feature, FileSource, RepoConfig >>> from feast.feature_store import RepoContents >>> from datetime import timedelta - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> driver = Entity(name="driver_id", description="driver id") >>> driver_hourly_stats = FileSource( - ... path="feature_repo/data/driver_stats.parquet", + ... path="project/feature_repo/data/driver_stats.parquet", ... timestamp_field="event_timestamp", ... created_timestamp_column="created", ... ) @@ -677,7 +701,7 @@ def plan( ... name="driver_hourly_stats", ... entities=[driver], ... ttl=timedelta(seconds=86400 * 1), - ... batch_source=driver_hourly_stats, + ... source=driver_hourly_stats, ... ) >>> registry_diff, infra_diff, new_infra = fs.plan(RepoContents( ... 
data_sources=[driver_hourly_stats], @@ -749,6 +773,7 @@ def apply( FeatureView, OnDemandFeatureView, RequestFeatureView, + BatchFeatureView, StreamFeatureView, FeatureService, ValidationReference, @@ -779,10 +804,10 @@ def apply( >>> from feast import FeatureStore, Entity, FeatureView, Feature, FileSource, RepoConfig >>> from datetime import timedelta - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> driver = Entity(name="driver_id", description="driver id") >>> driver_hourly_stats = FileSource( - ... path="feature_repo/data/driver_stats.parquet", + ... path="project/feature_repo/data/driver_stats.parquet", ... timestamp_field="event_timestamp", ... created_timestamp_column="created", ... ) @@ -790,7 +815,7 @@ def apply( ... name="driver_hourly_stats", ... entities=[driver], ... ttl=timedelta(seconds=86400 * 1), - ... batch_source=driver_hourly_stats, + ... source=driver_hourly_stats, ... ) >>> fs.apply([driver_hourly_stats_view, driver]) # register entity and feature view """ @@ -808,9 +833,9 @@ def apply( ob for ob in objects if ( - isinstance(ob, FeatureView) + # BFVs are not handled separately from FVs right now. 
+ (isinstance(ob, FeatureView) or isinstance(ob, BatchFeatureView)) and not isinstance(ob, StreamFeatureView) - and not isinstance(ob, BatchFeatureView) ) ] sfvs_to_update = [ob for ob in objects if isinstance(ob, StreamFeatureView)] @@ -826,6 +851,18 @@ def apply( ob for ob in objects if isinstance(ob, ValidationReference) ] + batch_sources_to_add: List[DataSource] = [] + for data_source in data_sources_set_to_update: + if ( + isinstance(data_source, PushSource) + or isinstance(data_source, KafkaSource) + or isinstance(data_source, KinesisSource) + ): + assert data_source.batch_source + batch_sources_to_add.append(data_source.batch_source) + for batch_source in batch_sources_to_add: + data_sources_set_to_update.add(batch_source) + for fv in itertools.chain(views_to_update, sfvs_to_update): data_sources_set_to_update.add(fv.batch_source) if fv.stream_source: @@ -839,7 +876,7 @@ def apply( ) for rfv in request_views_to_update: - data_sources_set_to_update.add(rfv.request_data_source) + data_sources_set_to_update.add(rfv.request_source) for odfv in odfvs_to_update: for v in odfv.source_request_sources.values(): @@ -881,13 +918,18 @@ def apply( validation_references, project=self.project, commit=False ) + entities_to_delete = [] + views_to_delete = [] + sfvs_to_delete = [] if not partial: # Delete all registry objects that should not exist. 
entities_to_delete = [ ob for ob in objects_to_delete if isinstance(ob, Entity) ] views_to_delete = [ - ob for ob in objects_to_delete if isinstance(ob, FeatureView) + ob + for ob in objects_to_delete + if isinstance(ob, FeatureView) or isinstance(ob, BatchFeatureView) ] request_views_to_delete = [ ob for ob in objects_to_delete if isinstance(ob, RequestFeatureView) @@ -941,10 +983,13 @@ def apply( validation_references.name, project=self.project, commit=False ) + tables_to_delete: List[FeatureView] = views_to_delete + sfvs_to_delete if not partial else [] # type: ignore + tables_to_keep: List[FeatureView] = views_to_update + sfvs_to_update # type: ignore + self._get_provider().update_infra( project=self.project, - tables_to_delete=views_to_delete + sfvs_to_delete if not partial else [], - tables_to_keep=views_to_update + sfvs_to_update, + tables_to_delete=tables_to_delete, + tables_to_keep=tables_to_keep, entities_to_delete=entities_to_delete if not partial else [], entities_to_keep=entities_to_update, partial=partial, @@ -1014,7 +1059,7 @@ def get_historical_features( >>> from feast import FeatureStore, RepoConfig >>> import pandas as pd - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> entity_df = pd.DataFrame.from_dict( ... { ... "driver_id": [1001, 1002], @@ -1108,6 +1153,7 @@ def create_saved_dataset( storage: SavedDatasetStorage, tags: Optional[Dict[str, str]] = None, feature_service: Optional[FeatureService] = None, + allow_overwrite: bool = False, ) -> SavedDataset: """ Execute provided retrieval job and persist its outcome in given storage. @@ -1116,18 +1162,27 @@ def create_saved_dataset( Name for the saved dataset should be unique within project, since it's possible to overwrite previously stored dataset with the same name. + Args: + from_: The retrieval job whose result should be persisted. + name: The name of the saved dataset. 
+ storage: The saved dataset storage object indicating where the result should be persisted. + tags (optional): A dictionary of key-value pairs to store arbitrary metadata. + feature_service (optional): The feature service that should be associated with this saved dataset. + allow_overwrite (optional): If True, the persisted result can overwrite an existing table or file. + Returns: SavedDataset object with attached RetrievalJob Raises: ValueError if given retrieval job doesn't have metadata """ - warnings.warn( - "Saving dataset is an experimental feature. " - "This API is unstable and it could and most probably will be changed in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if not flags_helper.is_test(): + warnings.warn( + "Saving dataset is an experimental feature. " + "This API is unstable and it could and most probably will be changed in the future. " + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) if not from_.metadata: raise ValueError( @@ -1147,7 +1202,7 @@ def create_saved_dataset( dataset.min_event_timestamp = from_.metadata.min_event_timestamp dataset.max_event_timestamp = from_.metadata.max_event_timestamp - from_.persist(storage) + from_.persist(storage=storage, allow_overwrite=allow_overwrite) dataset = dataset.with_retrieval_job( self._get_provider().retrieve_saved_dataset( @@ -1174,12 +1229,13 @@ def get_saved_dataset(self, name: str) -> SavedDataset: Raises: SavedDatasetNotFound """ - warnings.warn( - "Retrieving datasets is an experimental feature. " - "This API is unstable and it could and most probably will be changed in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if not flags_helper.is_test(): + warnings.warn( + "Retrieving datasets is an experimental feature. 
" + "This API is unstable and it could and most probably will be changed in the future. " + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) dataset = self._registry.get_saved_dataset(name, self.project) provider = self._get_provider() @@ -1217,7 +1273,7 @@ def materialize_incremental( >>> from feast import FeatureStore, RepoConfig >>> from datetime import datetime, timedelta - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> fs.materialize_incremental(end_date=datetime.utcnow() - timedelta(minutes=5)) Materializing... @@ -1306,7 +1362,7 @@ def materialize( from 3 hours ago to 10 minutes ago. >>> from feast import FeatureStore, RepoConfig >>> from datetime import datetime, timedelta - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> fs.materialize( ... start_date=datetime.utcnow() - timedelta(hours=3), end_date=datetime.utcnow() - timedelta(minutes=10) ... ) @@ -1373,12 +1429,6 @@ def push( allow_registry_cache: Whether to allow cached versions of the registry. to: Whether to push to online or offline store. Defaults to online store only. """ - warnings.warn( - "Push source is an experimental feature. " - "This API is unstable and it could and might change in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) from feast.data_source import PushSource all_fvs = self.list_feature_views(allow_cache=allow_registry_cache) @@ -1412,7 +1462,12 @@ def write_to_online_store( allow_registry_cache: bool = True, ): """ - ingests data directly into the Online store + Persists a dataframe to the online store. + + Args: + feature_view_name: The feature view to which the dataframe corresponds. + df: The dataframe to be persisted. + allow_registry_cache (optional): Whether to allow retrieving feature views from a cached registry. 
""" # TODO: restrict this to work with online StreamFeatureViews and validate the FeatureView type try: @@ -1511,7 +1566,7 @@ def get_online_features( Retrieve online features from an online store. >>> from feast import FeatureStore, RepoConfig - >>> fs = FeatureStore(repo_path="feature_repo") + >>> fs = FeatureStore(repo_path="project/feature_repo") >>> online_response = fs.get_online_features( ... features=[ ... "driver_hourly_stats:conv_rate", @@ -1565,7 +1620,7 @@ def _get_online_features( } # If the embedded Go code is enabled, send request to it instead of going through regular Python logic. - if self.config.go_feature_retrieval: + if self.config.go_feature_retrieval and self._go_server: self._lazy_init_go_server() entity_native_values: Dict[str, List[Any]] @@ -2217,7 +2272,7 @@ def serve( ) -> None: """Start the feature consumption server locally on a given port.""" type_ = type_.lower() - if self.config.go_feature_serving: + if self.config.go_feature_serving and self._go_server: # Start go server instead of python if the flag is enabled self._lazy_init_go_server() enable_logging = ( @@ -2267,11 +2322,12 @@ def serve_ui( self, host: str, port: int, get_registry_dump: Callable, registry_ttl_sec: int ) -> None: """Start the UI server locally""" - warnings.warn( - "The Feast UI is an experimental feature. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if flags_helper.is_test(): + warnings.warn( + "The Feast UI is an experimental feature. 
" + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) ui_server.start_server( self, host=host, @@ -2284,8 +2340,11 @@ def serve_ui( @log_exceptions_and_usage def serve_transformations(self, port: int) -> None: """Start the feature transformation server locally on a given port.""" - if not flags_helper.enable_on_demand_feature_views(self.config): - raise ExperimentalFeatureNotEnabled(flags.FLAG_ON_DEMAND_TRANSFORM_NAME) + warnings.warn( + "On demand feature view is an experimental feature. " + "This API is stable, but the functionality does not scale well for offline retrieval", + RuntimeWarning, + ) from feast import transformation_server @@ -2348,12 +2407,13 @@ def validate_logged_features( or None if successful. """ - warnings.warn( - "Logged features validation is an experimental feature. " - "This API is unstable and it could and most probably will be changed in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if not flags_helper.is_test(): + warnings.warn( + "Logged features validation is an experimental feature. " + "This API is unstable and it could and most probably will be changed in the future. 
" + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) if not isinstance(source, FeatureService): raise ValueError("Only feature service is currently supported as a source") @@ -2548,19 +2608,12 @@ def _validate_feature_views(feature_views: List[BaseFeatureView]): def _validate_data_sources(data_sources: List[DataSource]): - """Verify data sources have case-insensitively unique names""" + """Verify data sources have case-insensitively unique names.""" ds_names = set() for ds in data_sources: case_insensitive_ds_name = ds.name.lower() if case_insensitive_ds_name in ds_names: - if case_insensitive_ds_name.strip(): - warnings.warn( - f"More than one data source with name {case_insensitive_ds_name} found. " - f"Please ensure that all data source names are case-insensitively unique. " - f"It may be necessary to ignore certain files in your feature repository by using a .feastignore " - f"file. Starting in Feast 0.24, unique names (perhaps inferred from the table name) will be " - f"required in data sources to encourage data source discovery" - ) + raise DataSourceRepeatNamesException(case_insensitive_ds_name) else: ds_names.add(case_insensitive_ds_name) diff --git a/sdk/python/feast/feature_view.py b/sdk/python/feast/feature_view.py index 0310376646..41bad5828a 100644 --- a/sdk/python/feast/feature_view.py +++ b/sdk/python/feast/feature_view.py @@ -14,7 +14,7 @@ import copy import warnings from datetime import datetime, timedelta -from typing import Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Tuple, Type from google.protobuf.duration_pb2 import Duration from typeguard import typechecked @@ -23,7 +23,6 @@ from feast.base_feature_view import BaseFeatureView from feast.data_source import DataSource, KafkaSource, KinesisSource, PushSource from feast.entity import Entity -from feast.feature import Feature from feast.feature_view_projection import FeatureViewProjection from feast.field 
import Field from feast.protos.feast.core.FeatureView_pb2 import FeatureView as FeatureViewProto @@ -61,11 +60,10 @@ class FeatureView(BaseFeatureView): ttl: The amount of time this group of features lives. A ttl of 0 indicates that this group of features lives forever. Note that large ttl's or a ttl of 0 can result in extremely computationally intensive queries. - batch_source (optional): The batch source of data where this group of features + batch_source: The batch source of data where this group of features is stored. This is optional ONLY if a push source is specified as the - stream_source, since push sources contain their own batch sources. This is deprecated in favor of `source`. - stream_source (optional): The stream source of data where this group of features - is stored. This is deprecated in favor of `source`. + stream_source, since push sources contain their own batch sources. + stream_source: The stream source of data where this group of features is stored. schema: The schema of the feature view, including feature, timestamp, and entity columns. If not specified, can be inferred from the underlying data source. entity_columns: The list of entity columns contained in the schema. If not specified, @@ -78,8 +76,6 @@ class FeatureView(BaseFeatureView): tags: A dictionary of key-value pairs to store arbitrary metadata. owner: The owner of the feature view, typically the email of the primary maintainer. - source (optional): The source of data for this group of features. May be a stream source, or a batch source. - If a stream source, the source should contain a batch_source for backfills & batch materialization. 
""" name: str @@ -95,157 +91,83 @@ class FeatureView(BaseFeatureView): tags: Dict[str, str] owner: str materialization_intervals: List[Tuple[datetime, datetime]] - source: Optional[DataSource] @log_exceptions def __init__( self, - *args, - name: Optional[str] = None, - entities: Optional[Union[List[Entity], List[str]]] = None, - ttl: Optional[Union[Duration, timedelta]] = None, - batch_source: Optional[DataSource] = None, - stream_source: Optional[DataSource] = None, - features: Optional[List[Feature]] = None, - tags: Optional[Dict[str, str]] = None, + *, + name: str, + source: DataSource, + schema: Optional[List[Field]] = None, + entities: List[Entity] = None, + ttl: Optional[timedelta] = timedelta(days=0), online: bool = True, description: str = "", + tags: Optional[Dict[str, str]] = None, owner: str = "", - schema: Optional[List[Field]] = None, - source: Optional[DataSource] = None, ): """ Creates a FeatureView object. Args: name: The unique name of the feature view. - entities: The list of entities with which this group of features is associated. - ttl: The amount of time this group of features lives. A ttl of 0 indicates that + source: The source of data for this group of features. May be a stream source, or a batch source. + If a stream source, the source should contain a batch_source for backfills & batch materialization. + schema (optional): The schema of the feature view, including feature, timestamp, + and entity columns. + entities (optional): The list of entities with which this group of features is associated. + ttl (optional): The amount of time this group of features lives. A ttl of 0 indicates that this group of features lives forever. Note that large ttl's or a ttl of 0 can result in extremely computationally intensive queries. - batch_source: The batch source of data where this group of features is stored. - stream_source (optional): The stream source of data where this group of features - is stored. 
- features (deprecated): The list of features defined as part of this feature view. - tags (optional): A dictionary of key-value pairs to store arbitrary metadata. online (optional): A boolean indicating whether online retrieval is enabled for this feature view. description (optional): A human-readable description. + tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the feature view, typically the email of the primary maintainer. - schema (optional): The schema of the feature view, including feature, timestamp, - and entity columns. If entity columns are included in the schema, a List[Entity] - must be passed to `entities` instead of a List[str]; otherwise, the entity columns - will be mistakenly interpreted as feature columns. - source (optional): The source of data for this group of features. May be a stream source, or a batch source. - If a stream source, the source should contain a batch_source for backfills & batch materialization. Raises: ValueError: A field mapping conflicts with an Entity or a Feature. """ - positional_attributes = ["name", "entities", "ttl"] + self.name = name + self.entities = [e.name for e in entities] if entities else [DUMMY_ENTITY_NAME] + self.ttl = ttl + self.schema = schema or [] - _name = name - _entities = entities - _ttl = ttl - - if args: - warnings.warn( - ( - "feature view parameters should be specified as a keyword argument instead of a positional arg." - "Feast 0.24+ will not support positional arguments to construct feature views" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): + # Initialize data sources. + if ( + isinstance(source, PushSource) + or isinstance(source, KafkaSource) + or isinstance(source, KinesisSource) + ): + self.stream_source = source + if not source.batch_source: raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"feature views, for backwards compatibility." 
+ f"A batch_source needs to be specified for stream source `{source.name}`" ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _entities = args[1] - if len(args) >= 3: - _ttl = args[2] - - if not _name: - raise ValueError("feature view name needs to be specified") - - self.name = _name - - self.entities = ( - [e.name if isinstance(e, Entity) else e for e in _entities] - if _entities - else [DUMMY_ENTITY_NAME] - ) - if _entities and isinstance(_entities[0], str): - warnings.warn( - ( - "The `entities` parameter should be a list of `Entity` objects. " - "Feast 0.24 and onwards will not support passing in a list of " - "strings to define entities." - ), - DeprecationWarning, - ) - - self._initialize_sources(_name, batch_source, stream_source, source) - - if isinstance(_ttl, Duration): - self.ttl = timedelta(seconds=int(_ttl.seconds)) - warnings.warn( - ( - "The option to pass a Duration object to the ttl parameter is being deprecated. " - "Please pass a timedelta object instead. Feast 0.24 and onwards will not support " - "Duration objects." - ), - DeprecationWarning, - ) - elif isinstance(_ttl, timedelta) or _ttl is None: - self.ttl = _ttl + else: + self.batch_source = source.batch_source else: - raise ValueError(f"unknown value type specified for ttl {type(_ttl)}") - - if features is not None: - warnings.warn( - ( - "The `features` parameter is being deprecated in favor of the `schema` parameter. " - "Please switch from using `features` to `schema`. This will also requiring switching " - "feature definitions from using `Feature` to `Field`. Feast 0.24 and onwards will not " - "support the `features` parameter." 
- ), - DeprecationWarning, - ) + self.stream_source = None + self.batch_source = source - _schema = schema or [] - if len(_schema) == 0 and features is not None: - _schema = [Field.from_feature(feature) for feature in features] - self.schema = _schema - - # If a user has added entity fields to schema, then they should also have switched - # to using a List[Entity], in which case entity and feature columns can be separated - # here. Conversely, if the user is still using a List[str], they must not have added - # added entity fields, in which case we can set the `features` attribute directly - # equal to the schema. - _features: List[Field] = [] + # Initialize features and entity columns. + features: List[Field] = [] self.entity_columns = [] - if _entities and len(_entities) > 0 and isinstance(_entities[0], str): - _features = _schema - else: - join_keys = [] - if _entities: - for entity in _entities: - if isinstance(entity, Entity): - join_keys += entity.join_keys - - for field in _schema: - if field.name in join_keys: - self.entity_columns.append(field) - else: - _features.append(field) + + join_keys = [] + if entities: + for entity in entities: + join_keys += entity.join_keys + + for field in self.schema: + if field.name in join_keys: + self.entity_columns.append(field) + else: + features.append(field) # TODO(felixwang9817): Add more robust validation of features. 
- cols = [field.name for field in _schema] + cols = [field.name for field in self.schema] for col in cols: if ( self.batch_source.field_mapping is not None @@ -258,8 +180,8 @@ def __init__( ) super().__init__( - name=_name, - features=_features, + name=name, + features=features, description=description, tags=tags, owner=owner, @@ -267,41 +189,6 @@ def __init__( self.online = online self.materialization_intervals = [] - def _initialize_sources(self, name, batch_source, stream_source, source): - if source: - if ( - isinstance(source, PushSource) - or isinstance(source, KafkaSource) - or isinstance(source, KinesisSource) - ): - self.stream_source = source - if not source.batch_source: - raise ValueError( - f"A batch_source needs to be specified for stream source `{source.name}`" - ) - else: - self.batch_source = source.batch_source - else: - self.stream_source = stream_source - self.batch_source = source - else: - warnings.warn( - "batch_source and stream_source have been deprecated in favor of `source`." - "The deprecated fields will be removed in Feast 0.24.", - DeprecationWarning, - ) - if stream_source is not None and isinstance(stream_source, PushSource): - self.stream_source = stream_source - self.batch_source = stream_source.batch_source - else: - if batch_source is None: - raise ValueError( - f"A batch_source needs to be specified for feature view `{name}`" - ) - self.stream_source = stream_source - self.batch_source = batch_source - self.source = source - def __hash__(self): return super().__hash__() @@ -309,21 +196,17 @@ def __copy__(self): fv = FeatureView( name=self.name, ttl=self.ttl, - source=self.batch_source, - stream_source=self.stream_source, + source=self.stream_source if self.stream_source else self.batch_source, schema=self.schema, tags=self.tags, online=self.online, ) - # This is deliberately set outside of the FV initialization to avoid the deprecation warning. 
- # TODO(felixwang9817): Move this into the FV initialization when the deprecation warning - # is removed. + # This is deliberately set outside of the FV initialization as we do not have the Entity objects. fv.entities = self.entities fv.features = copy.copy(self.features) fv.entity_columns = copy.copy(self.entity_columns) fv.projection = copy.copy(self.projection) - fv.entities = self.entities return fv def __eq__(self, other): diff --git a/sdk/python/feast/feature_view_projection.py b/sdk/python/feast/feature_view_projection.py index a862e5f08d..2960996a10 100644 --- a/sdk/python/feast/feature_view_projection.py +++ b/sdk/python/feast/feature_view_projection.py @@ -21,6 +21,10 @@ class FeatureViewProjection: name: The unique name of the feature view from which this projection is created. name_alias: An optional alias for the name. features: The list of features represented by the feature view projection. + desired_features: The list of features that this feature view projection intends to select. + If empty, the projection intends to select all features. This attribute is only used + for feature service inference. It should only be set if the underlying feature view + is not ready to be projected, i.e. still needs to go through feature inference. join_key_map: A map to modify join key columns during retrieval of this feature view projection. """ diff --git a/sdk/python/feast/file_utils.py b/sdk/python/feast/file_utils.py new file mode 100644 index 0000000000..0a3b614dd4 --- /dev/null +++ b/sdk/python/feast/file_utils.py @@ -0,0 +1,85 @@ +# +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +def replace_str_in_file(file_path, match_str, sub_str): + """ + Replace a string, in-place, in a text file, throughout. + Does not return anything, side-effect only. + Inputs are: + file_path, a string with the path to the ascii file to edit + match_str, the substring to be replaced (as many times as it's found) + sub_str, the string to insert in place of match_str + NOTE: not suitable for very large files (it does all in-memory). + """ + with open(file_path, "r") as f: + contents = f.read() + contents = contents.replace(match_str, sub_str) + with open(file_path, "wt") as f: + f.write(contents) + + +def remove_lines_from_file(file_path, match_str, partial=True): + """ + Edit an ascii file (in-place) by removing all lines that + match a given string (partially or totally). + Does not return anything, side-effect only. + Inputs are: + file_path, a string with the path to the ascii file to edit + match_str, the string to look for in the file lines + partial, a boolean: if True, any line with match_str as substring + will be removed; if False, only lines matching it entirely. + NOTE: not suitable for very large files (it does all in-memory). 
+ """ + + def _line_matcher(line, _m=match_str, _p=partial): + if _p: + return _m in line + else: + return _m == line + + with open(file_path, "r") as f: + file_lines = list(f.readlines()) + + new_file_lines = [line for line in file_lines if not _line_matcher(line)] + + with open(file_path, "wt") as f: + f.write("".join(new_file_lines)) + + +def write_setting_or_remove( + file_path, setting_value, setting_name, setting_placeholder_value +): + """ + Utility to adapt a settings-file template to some provided values. + Assumes the file has lines such as + " username: c_username" + (quotes excluded) where the placeholder might be replaced with actual value + or the line might not be needed altogether. + Then, calling + write_settings_or_remove(file_path, new_username, 'username', 'c_username') + the file is edited in-place in one of two ways: + 1. if new_username is None, the line disappears completely + 2. if e.g. new_username == 'jenny', the line becomes + " username: jenny" + This utility is called repeatedly (a bit inefficiently, admittedly) + to refine the template feature-store yaml config to suit the parameters + supplied during a "feast init" feature store setup. 
+ """ + if setting_value is not None: + replace_str_in_file(file_path, setting_placeholder_value, str(setting_value)) + else: + remove_lines_from_file(file_path, setting_name) diff --git a/sdk/python/feast/flags.py b/sdk/python/feast/flags.py deleted file mode 100644 index 26e20d81f6..0000000000 --- a/sdk/python/feast/flags.py +++ /dev/null @@ -1,10 +0,0 @@ -FLAG_ALPHA_FEATURES_NAME = "alpha_features" -FLAG_ON_DEMAND_TRANSFORM_NAME = "on_demand_transforms" -FLAG_AWS_LAMBDA_FEATURE_SERVER_NAME = "aws_lambda_feature_server" -ENV_FLAG_IS_TEST = "IS_TEST" - -FLAG_NAMES = { - FLAG_ALPHA_FEATURES_NAME, - FLAG_ON_DEMAND_TRANSFORM_NAME, - FLAG_AWS_LAMBDA_FEATURE_SERVER_NAME, -} diff --git a/sdk/python/feast/flags_helper.py b/sdk/python/feast/flags_helper.py index 7cf16dbf0b..4763f4a283 100644 --- a/sdk/python/feast/flags_helper.py +++ b/sdk/python/feast/flags_helper.py @@ -1,39 +1,11 @@ import os -from feast import flags -from feast.repo_config import RepoConfig +ENV_FLAG_IS_TEST = "IS_TEST" def _env_flag_enabled(name: str) -> bool: return os.getenv(name, default="False") == "True" -def feature_flag_enabled(repo_config: RepoConfig, flag_name: str) -> bool: - if is_test(): - return True - return ( - _alpha_feature_flag_enabled(repo_config) - and repo_config.flags is not None - and flag_name in repo_config.flags - and repo_config.flags[flag_name] - ) - - -def _alpha_feature_flag_enabled(repo_config: RepoConfig) -> bool: - return ( - repo_config.flags is not None - and flags.FLAG_ALPHA_FEATURES_NAME in repo_config.flags - and repo_config.flags[flags.FLAG_ALPHA_FEATURES_NAME] - ) - - def is_test() -> bool: - return _env_flag_enabled(flags.ENV_FLAG_IS_TEST) - - -def enable_on_demand_feature_views(repo_config: RepoConfig) -> bool: - return feature_flag_enabled(repo_config, flags.FLAG_ON_DEMAND_TRANSFORM_NAME) - - -def enable_aws_lambda_feature_server(repo_config: RepoConfig) -> bool: - return feature_flag_enabled(repo_config, flags.FLAG_AWS_LAMBDA_FEATURE_SERVER_NAME) + return 
_env_flag_enabled(ENV_FLAG_IS_TEST) diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 0b8e42b4e9..84e8321c12 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -7,6 +7,9 @@ from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_NAME, FeatureView from feast.field import Field, from_value_type from feast.infra.offline_stores.bigquery_source import BigQuerySource +from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import ( + MsSqlServerSource, +) from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource @@ -40,12 +43,14 @@ def update_data_sources_with_inferred_event_timestamp_col( ts_column_type_regex_pattern = "TIMESTAMP[A-Z]*" elif isinstance(data_source, SnowflakeSource): ts_column_type_regex_pattern = "TIMESTAMP_[A-Z]*" + elif isinstance(data_source, MsSqlServerSource): + ts_column_type_regex_pattern = "TIMESTAMP|DATETIME" else: raise RegistryInferenceFailure( "DataSource", f""" DataSource inferencing of timestamp_field is currently only supported - for FileSource, SparkSource, BigQuerySource, RedshiftSource, and SnowflakeSource. + for FileSource, SparkSource, BigQuerySource, RedshiftSource, SnowflakeSource, MsSqlSource. Attempting to infer from {data_source}. """, ) @@ -55,6 +60,7 @@ def update_data_sources_with_inferred_event_timestamp_col( or isinstance(data_source, BigQuerySource) or isinstance(data_source, RedshiftSource) or isinstance(data_source, SnowflakeSource) + or isinstance(data_source, MsSqlServerSource) or "SparkSource" == data_source.__class__.__name__ ) @@ -150,7 +156,11 @@ def update_feature_views_with_inferred_features_and_entities( ) # Infer a dummy entity column for entityless feature views. 
- if len(fv.entities) == 1 and fv.entities[0] == DUMMY_ENTITY_NAME: + if ( + len(fv.entities) == 1 + and fv.entities[0] == DUMMY_ENTITY_NAME + and not fv.entity_columns + ): fv.entity_columns.append(Field(name=DUMMY_ENTITY_ID, dtype=String)) # Run inference for entity columns if there are fewer entity fields than expected. @@ -199,10 +209,10 @@ def _infer_features_and_entities( fv.batch_source.timestamp_field, fv.batch_source.created_timestamp_column, } - for column in columns_to_exclude: - if column in fv.batch_source.field_mapping: - columns_to_exclude.remove(column) - columns_to_exclude.add(fv.batch_source.field_mapping[column]) + for original_col, mapped_col in fv.batch_source.field_mapping.items(): + if mapped_col in columns_to_exclude: + columns_to_exclude.remove(mapped_col) + columns_to_exclude.add(original_col) table_column_names_and_types = fv.batch_source.get_table_column_names_and_types( config diff --git a/sdk/python/feast/infra/aws.py b/sdk/python/feast/infra/aws.py index 145c55952e..f334998e6b 100644 --- a/sdk/python/feast/infra/aws.py +++ b/sdk/python/feast/infra/aws.py @@ -3,11 +3,8 @@ import logging import os import uuid -from datetime import datetime -from pathlib import Path -from tempfile import TemporaryFile +import warnings from typing import Optional, Sequence -from urllib.parse import urlparse from colorama import Fore, Style @@ -22,22 +19,15 @@ from feast.errors import ( AwsAPIGatewayDoesNotExist, AwsLambdaDoesNotExist, - ExperimentalFeatureNotEnabled, IncompatibleRegistryStoreClass, RepoConfigPathDoesNotExist, - S3RegistryBucketForbiddenAccess, - S3RegistryBucketNotExist, ) from feast.feature_view import FeatureView -from feast.flags import FLAG_AWS_LAMBDA_FEATURE_SERVER_NAME -from feast.flags_helper import enable_aws_lambda_feature_server from feast.infra.feature_servers.aws_lambda.config import AwsLambdaFeatureServerConfig from feast.infra.passthrough_provider import PassthroughProvider +from feast.infra.registry.registry import 
get_registry_store_class_from_scheme +from feast.infra.registry.s3 import S3RegistryStore from feast.infra.utils import aws_utils -from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry import get_registry_store_class_from_scheme -from feast.registry_store import RegistryStore -from feast.repo_config import RegistryConfig from feast.usage import log_exceptions_and_usage from feast.version import get_version @@ -74,8 +64,11 @@ def update_infra( ) if self.repo_config.feature_server and self.repo_config.feature_server.enabled: - if not enable_aws_lambda_feature_server(self.repo_config): - raise ExperimentalFeatureNotEnabled(FLAG_AWS_LAMBDA_FEATURE_SERVER_NAME) + warnings.warn( + "AWS Lambda based feature serving is an experimental feature. " + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) # Since the AWS Lambda feature server will attempt to load the registry, we # only allow the registry to be in S3. 
@@ -361,64 +354,3 @@ def _get_docker_image_version() -> str: "> pip install -e '.'" ) return version - - -class S3RegistryStore(RegistryStore): - def __init__(self, registry_config: RegistryConfig, repo_path: Path): - uri = registry_config.path - self._uri = urlparse(uri) - self._bucket = self._uri.hostname - self._key = self._uri.path.lstrip("/") - - self.s3_client = boto3.resource( - "s3", endpoint_url=os.environ.get("FEAST_S3_ENDPOINT_URL") - ) - - @log_exceptions_and_usage(registry="s3") - def get_registry_proto(self): - file_obj = TemporaryFile() - registry_proto = RegistryProto() - try: - from botocore.exceptions import ClientError - except ImportError as e: - from feast.errors import FeastExtrasDependencyImportError - - raise FeastExtrasDependencyImportError("aws", str(e)) - try: - bucket = self.s3_client.Bucket(self._bucket) - self.s3_client.meta.client.head_bucket(Bucket=bucket.name) - except ClientError as e: - # If a client error is thrown, then check that it was a 404 error. - # If it was a 404 error, then the bucket does not exist. 
- error_code = int(e.response["Error"]["Code"]) - if error_code == 404: - raise S3RegistryBucketNotExist(self._bucket) - else: - raise S3RegistryBucketForbiddenAccess(self._bucket) from e - - try: - obj = bucket.Object(self._key) - obj.download_fileobj(file_obj) - file_obj.seek(0) - registry_proto.ParseFromString(file_obj.read()) - return registry_proto - except ClientError as e: - raise FileNotFoundError( - f"Error while trying to locate Registry at path {self._uri.geturl()}" - ) from e - - @log_exceptions_and_usage(registry="s3") - def update_registry_proto(self, registry_proto: RegistryProto): - self._write_registry(registry_proto) - - def teardown(self): - self.s3_client.Object(self._bucket, self._key).delete() - - def _write_registry(self, registry_proto: RegistryProto): - registry_proto.version_id = str(uuid.uuid4()) - registry_proto.last_updated.FromDatetime(datetime.utcnow()) - # we have already checked the bucket exists so no need to do it again - file_obj = TemporaryFile() - file_obj.write(registry_proto.SerializeToString()) - file_obj.seek(0) - self.s3_client.Bucket(self._bucket).put_object(Body=file_obj, Key=self._key) diff --git a/sdk/python/feast/infra/contrib/azure_provider.py b/sdk/python/feast/infra/contrib/azure_provider.py new file mode 100644 index 0000000000..ac56a2b33e --- /dev/null +++ b/sdk/python/feast/infra/contrib/azure_provider.py @@ -0,0 +1,72 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+from datetime import datetime +from typing import Callable + +from tqdm import tqdm + +from feast.feature_view import FeatureView +from feast.infra.passthrough_provider import PassthroughProvider +from feast.infra.registry.base_registry import BaseRegistry +from feast.repo_config import RepoConfig +from feast.utils import ( + _convert_arrow_to_proto, + _get_column_names, + _run_pyarrow_field_mapping, +) + +DEFAULT_BATCH_SIZE = 10_000 + + +class AzureProvider(PassthroughProvider): + def materialize_single_feature_view( + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], + ) -> None: + # TODO(kevjumba): untested + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + event_timestamp_column, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=event_timestamp_column, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + table = offline_job.to_arrow() + + if feature_view.batch_source.field_mapping is not None: + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) + + join_keys = {entity.join_key: entity.value_type for entity in entities} + + with tqdm_builder(table.num_rows) as pbar: + for batch in table.to_batches(DEFAULT_BATCH_SIZE): + rows_to_write = _convert_arrow_to_proto(batch, feature_view, join_keys) + self.online_write_batch( + self.repo_config, + feature_view, + rows_to_write, + lambda x: pbar.update(x), + ) diff --git 
a/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile b/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile new file mode 100644 index 0000000000..b853411e27 --- /dev/null +++ b/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.8 + +RUN apt update && \ + apt install -y jq +RUN pip install pip --upgrade +RUN pip install "feast[aws,gcp,snowflake,redis,go]" +RUN apt update +RUN apt install -y -V ca-certificates lsb-release wget +RUN wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt update +RUN apt -y install libarrow-dev \ No newline at end of file diff --git a/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile.dev b/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile.dev new file mode 100644 index 0000000000..f1dd7cc390 --- /dev/null +++ b/sdk/python/feast/infra/feature_servers/multicloud/Dockerfile.dev @@ -0,0 +1,14 @@ +FROM python:3.8 + +RUN apt update && \ + apt install -y jq +RUN pip install pip --upgrade +COPY . . 
+ +RUN pip install ".[aws,gcp,snowflake,redis,go]" +RUN apt update +RUN apt install -y -V ca-certificates lsb-release wget +RUN wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt update +RUN apt -y install libarrow-dev \ No newline at end of file diff --git a/sdk/python/feast/infra/feature_servers/multicloud/__init__.py b/sdk/python/feast/infra/feature_servers/multicloud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/gcp.py b/sdk/python/feast/infra/gcp.py index 257ae38d02..512378237a 100644 --- a/sdk/python/feast/infra/gcp.py +++ b/sdk/python/feast/infra/gcp.py @@ -1,14 +1,4 @@ -import uuid -from datetime import datetime -from pathlib import Path -from tempfile import TemporaryFile -from urllib.parse import urlparse - from feast.infra.passthrough_provider import PassthroughProvider -from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry_store import RegistryStore -from feast.repo_config import RegistryConfig -from feast.usage import log_exceptions_and_usage class GcpProvider(PassthroughProvider): @@ -17,68 +7,3 @@ class GcpProvider(PassthroughProvider): """ pass - - -class GCSRegistryStore(RegistryStore): - def __init__(self, registry_config: RegistryConfig, repo_path: Path): - uri = registry_config.path - try: - import google.cloud.storage as storage - except ImportError as e: - from feast.errors import FeastExtrasDependencyImportError - - raise FeastExtrasDependencyImportError("gcp", str(e)) - - self.gcs_client = storage.Client() - self._uri = urlparse(uri) - self._bucket = self._uri.hostname - self._blob = self._uri.path.lstrip("/") - - @log_exceptions_and_usage(registry="gs") - def get_registry_proto(self): - import google.cloud.storage as storage - from 
google.cloud.exceptions import NotFound - - file_obj = TemporaryFile() - registry_proto = RegistryProto() - try: - bucket = self.gcs_client.get_bucket(self._bucket) - except NotFound: - raise Exception( - f"No bucket named {self._bucket} exists; please create it first." - ) - if storage.Blob(bucket=bucket, name=self._blob).exists(self.gcs_client): - self.gcs_client.download_blob_to_file( - self._uri.geturl(), file_obj, timeout=30 - ) - file_obj.seek(0) - registry_proto.ParseFromString(file_obj.read()) - return registry_proto - raise FileNotFoundError( - f'Registry not found at path "{self._uri.geturl()}". Have you run "feast apply"?' - ) - - @log_exceptions_and_usage(registry="gs") - def update_registry_proto(self, registry_proto: RegistryProto): - self._write_registry(registry_proto) - - def teardown(self): - from google.cloud.exceptions import NotFound - - gs_bucket = self.gcs_client.get_bucket(self._bucket) - try: - gs_bucket.delete_blob(self._blob) - except NotFound: - # If the blob deletion fails with NotFound, it has already been deleted. 
- pass - - def _write_registry(self, registry_proto: RegistryProto): - registry_proto.version_id = str(uuid.uuid4()) - registry_proto.last_updated.FromDatetime(datetime.utcnow()) - # we have already checked the bucket exists so no need to do it again - gs_bucket = self.gcs_client.get_bucket(self._bucket) - blob = gs_bucket.blob(self._blob) - file_obj = TemporaryFile() - file_obj.write(registry_proto.SerializeToString()) - file_obj.seek(0) - blob.upload_from_file(file_obj) diff --git a/sdk/python/feast/infra/local.py b/sdk/python/feast/infra/local.py index 7249d247a2..1226ceaf37 100644 --- a/sdk/python/feast/infra/local.py +++ b/sdk/python/feast/infra/local.py @@ -1,14 +1,9 @@ -import uuid -from datetime import datetime -from pathlib import Path from typing import List from feast.infra.infra_object import Infra, InfraObject from feast.infra.passthrough_provider import PassthroughProvider from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry_store import RegistryStore -from feast.repo_config import RegistryConfig, RepoConfig -from feast.usage import log_exceptions_and_usage +from feast.repo_config import RepoConfig class LocalProvider(PassthroughProvider): @@ -26,42 +21,3 @@ def plan_infra( ) infra.infra_objects += infra_objects return infra - - -class LocalRegistryStore(RegistryStore): - def __init__(self, registry_config: RegistryConfig, repo_path: Path): - registry_path = Path(registry_config.path) - if registry_path.is_absolute(): - self._filepath = registry_path - else: - self._filepath = repo_path.joinpath(registry_path) - - @log_exceptions_and_usage(registry="local") - def get_registry_proto(self): - registry_proto = RegistryProto() - if self._filepath.exists(): - registry_proto.ParseFromString(self._filepath.read_bytes()) - return registry_proto - raise FileNotFoundError( - f'Registry not found at path "{self._filepath}". Have you run "feast apply"?' 
- ) - - @log_exceptions_and_usage(registry="local") - def update_registry_proto(self, registry_proto: RegistryProto): - self._write_registry(registry_proto) - - def teardown(self): - try: - self._filepath.unlink() - except FileNotFoundError: - # If the file deletion fails with FileNotFoundError, the file has already - # been deleted. - pass - - def _write_registry(self, registry_proto: RegistryProto): - registry_proto.version_id = str(uuid.uuid4()) - registry_proto.last_updated.FromDatetime(datetime.utcnow()) - file_dir = self._filepath.parent - file_dir.mkdir(exist_ok=True) - with open(self._filepath, mode="wb", buffering=0) as f: - f.write(registry_proto.SerializeToString()) diff --git a/sdk/python/feast/infra/materialization/__init__.py b/sdk/python/feast/infra/materialization/__init__.py deleted file mode 100644 index 815f98739b..0000000000 --- a/sdk/python/feast/infra/materialization/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from .batch_materialization_engine import ( - BatchMaterializationEngine, - MaterializationJob, - MaterializationTask, -) -from .local_engine import LocalMaterializationEngine, LocalMaterializationJob - -__all__ = [ - "MaterializationJob", - "MaterializationTask", - "BatchMaterializationEngine", - "LocalMaterializationEngine", - "LocalMaterializationJob", -] diff --git a/sdk/python/feast/infra/materialization/lambda/Dockerfile b/sdk/python/feast/infra/materialization/aws_lambda/Dockerfile similarity index 100% rename from sdk/python/feast/infra/materialization/lambda/Dockerfile rename to sdk/python/feast/infra/materialization/aws_lambda/Dockerfile diff --git a/sdk/python/feast/infra/materialization/lambda/app.py b/sdk/python/feast/infra/materialization/aws_lambda/app.py similarity index 100% rename from sdk/python/feast/infra/materialization/lambda/app.py rename to sdk/python/feast/infra/materialization/aws_lambda/app.py diff --git a/sdk/python/feast/infra/materialization/lambda/lambda_engine.py 
b/sdk/python/feast/infra/materialization/aws_lambda/lambda_engine.py similarity index 98% rename from sdk/python/feast/infra/materialization/lambda/lambda_engine.py rename to sdk/python/feast/infra/materialization/aws_lambda/lambda_engine.py index 69986ca6e1..53a845140e 100644 --- a/sdk/python/feast/infra/materialization/lambda/lambda_engine.py +++ b/sdk/python/feast/infra/materialization/aws_lambda/lambda_engine.py @@ -22,7 +22,7 @@ ) from feast.infra.offline_stores.offline_store import OfflineStore from feast.infra.online_stores.online_store import OnlineStore -from feast.registry import BaseRegistry +from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.stream_feature_view import StreamFeatureView from feast.utils import _get_column_names @@ -227,7 +227,7 @@ def _materialize_one( logger.info( f"Ingested task; request id {response['ResponseMetadata']['RequestId']}, " - f"rows written: {output['written_rows']}" + f"Output: {output}" ) for f in not_done: diff --git a/sdk/python/feast/infra/materialization/batch_materialization_engine.py b/sdk/python/feast/infra/materialization/batch_materialization_engine.py index 1890ffed5a..41ab9f22d4 100644 --- a/sdk/python/feast/infra/materialization/batch_materialization_engine.py +++ b/sdk/python/feast/infra/materialization/batch_materialization_engine.py @@ -11,7 +11,7 @@ from feast.feature_view import FeatureView from feast.infra.offline_stores.offline_store import OfflineStore from feast.infra.online_stores.online_store import OnlineStore -from feast.registry import BaseRegistry +from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import RepoConfig from feast.stream_feature_view import StreamFeatureView @@ -42,7 +42,7 @@ class MaterializationJobStatus(enum.Enum): class MaterializationJob(ABC): """ - MaterializationJob represents an ongoing or executed process that materializes data as per the + A 
MaterializationJob represents an ongoing or executed process that materializes data as per the definition of a materialization task. """ @@ -70,6 +70,10 @@ def url(self) -> Optional[str]: class BatchMaterializationEngine(ABC): + """ + The interface that Feast uses to control the compute system that handles batch materialization. + """ + def __init__( self, *, @@ -95,8 +99,19 @@ def update( entities_to_delete: Sequence[Entity], entities_to_keep: Sequence[Entity], ): - """This method ensures that any necessary infrastructure or resources needed by the - engine are set up ahead of materialization.""" + """ + Prepares cloud resources required for batch materialization for the specified set of Feast objects. + + Args: + project: Feast project to which the objects belong. + views_to_delete: Feature views whose corresponding infrastructure should be deleted. + views_to_keep: Feature views whose corresponding infrastructure should not be deleted, and + may need to be updated. + entities_to_delete: Entities whose corresponding infrastructure should be deleted. + entities_to_keep: Entities whose corresponding infrastructure should not be deleted, and + may need to be updated. + """ + pass @abstractmethod def materialize( @@ -104,13 +119,15 @@ def materialize( ) -> List[MaterializationJob]: """ Materialize data from the offline store to the online store for this feature repo. + Args: - registry: The feast registry containing the applied feature views. + registry: The registry for the current feature store. tasks: A list of individual materialization tasks. + Returns: A list of materialization jobs representing each task. """ - ... 
+ pass @abstractmethod def teardown_infra( @@ -119,4 +136,12 @@ def teardown_infra( fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], entities: Sequence[Entity], ): - """This method ensures that any infrastructure or resources set up by ``update()``are torn down.""" + """ + Tears down all cloud resources used by the materialization engine for the specified set of Feast objects. + + Args: + project: Feast project to which the objects belong. + fvs: Feature views whose corresponding infrastructure should be deleted. + entities: Entities whose corresponding infrastructure should be deleted. + """ + pass diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile b/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile new file mode 100644 index 0000000000..963924f38d --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.9-slim-bullseye AS build + +RUN apt-get update && \ + apt-get install --no-install-suggests --no-install-recommends --yes git + +WORKDIR /bytewax + +# Copy dataflow code +COPY sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_dataflow.py /bytewax +COPY sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py /bytewax + +# Copy entrypoint +COPY sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh /bytewax + +# Copy necessary parts of the Feast codebase +COPY sdk/python sdk/python +COPY protos protos +COPY go go +COPY setup.py setup.py +COPY pyproject.toml pyproject.toml +COPY README.md README.md + +# Install Feast for AWS with Bytewax dependencies +# We need this mount thingy because setuptools_scm needs access to the +# git dir to infer the version of feast we're installing. +# https://github.com/pypa/setuptools_scm#usage-from-docker +# I think it also assumes that this dockerfile is being built from the root of the directory. 
+RUN --mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.[aws,gcp,bytewax]' + diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/__init__.py b/sdk/python/feast/infra/materialization/contrib/bytewax/__init__.py new file mode 100644 index 0000000000..0838a4c0d5 --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/__init__.py @@ -0,0 +1,15 @@ +from .bytewax_materialization_dataflow import BytewaxMaterializationDataflow +from .bytewax_materialization_engine import ( + BytewaxMaterializationEngine, + BytewaxMaterializationEngineConfig, +) +from .bytewax_materialization_job import BytewaxMaterializationJob +from .bytewax_materialization_task import BytewaxMaterializationTask + +__all__ = [ + "BytewaxMaterializationTask", + "BytewaxMaterializationJob", + "BytewaxMaterializationDataflow", + "BytewaxMaterializationEngine", + "BytewaxMaterializationEngineConfig", +] diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_dataflow.py b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_dataflow.py new file mode 100644 index 0000000000..1fad2c909f --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_dataflow.py @@ -0,0 +1,89 @@ +from typing import List + +import pyarrow as pa +import pyarrow.parquet as pq +import s3fs +from bytewax import Dataflow, cluster_main # type: ignore +from bytewax.inputs import AdvanceTo, Emit, ManualInputConfig, distribute +from bytewax.parse import proc_env +from tqdm import tqdm + +from feast import FeatureStore, FeatureView, RepoConfig +from feast.utils import _convert_arrow_to_proto, _run_pyarrow_field_mapping + + +class BytewaxMaterializationDataflow: + def __init__( + self, + config: RepoConfig, + feature_view: FeatureView, + paths: List[str], + ): + self.config = config + self.feature_store = FeatureStore(config=config) + + self.feature_view = feature_view + self.paths = 
paths + + self._run_dataflow() + + def process_path(self, path): + fs = s3fs.S3FileSystem() + dataset = pq.ParquetDataset(path, filesystem=fs, use_legacy_dataset=False) + batches = [] + for fragment in dataset.fragments: + for batch in fragment.to_table().to_batches(): + batches.append(batch) + + return batches + + def input_builder(self, worker_index, worker_count, resume_epoch): + worker_paths = distribute(self.paths, worker_index, worker_count) + epoch = 0 + for path in worker_paths: + yield AdvanceTo(epoch) + yield Emit(path) + epoch += 1 + + return + + def output_builder(self, worker_index, worker_count): + def output_fn(epoch_batch): + _, batch = epoch_batch + + table = pa.Table.from_batches([batch]) + + if self.feature_view.batch_source.field_mapping is not None: + table = _run_pyarrow_field_mapping( + table, self.feature_view.batch_source.field_mapping + ) + + join_key_to_value_type = { + entity.name: entity.dtype.to_value_type() + for entity in self.feature_view.entity_columns + } + + rows_to_write = _convert_arrow_to_proto( + table, self.feature_view, join_key_to_value_type + ) + provider = self.feature_store._get_provider() + with tqdm(total=len(rows_to_write)) as progress: + provider.online_write_batch( + config=self.config, + table=self.feature_view, + data=rows_to_write, + progress=progress.update, + ) + + return output_fn + + def _run_dataflow(self): + flow = Dataflow() + flow.flat_map(self.process_path) + flow.capture() + cluster_main( + flow, + ManualInputConfig(self.input_builder), + self.output_builder, + **proc_env(), + ) diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py new file mode 100644 index 0000000000..0477722eb1 --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py @@ -0,0 +1,392 @@ +import uuid +from datetime import datetime +from typing 
import Callable, List, Literal, Sequence, Union + +import yaml +from kubernetes import client +from kubernetes import config as k8s_config +from kubernetes import utils +from kubernetes.utils import FailToCreateError +from pydantic import StrictStr +from tqdm import tqdm + +from feast import FeatureView, RepoConfig +from feast.batch_feature_view import BatchFeatureView +from feast.entity import Entity +from feast.infra.materialization.batch_materialization_engine import ( + BatchMaterializationEngine, + MaterializationJob, + MaterializationTask, +) +from feast.infra.offline_stores.offline_store import OfflineStore +from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.registry.base_registry import BaseRegistry +from feast.repo_config import FeastConfigBaseModel +from feast.stream_feature_view import StreamFeatureView +from feast.utils import _get_column_names + +from .bytewax_materialization_job import BytewaxMaterializationJob + + +class BytewaxMaterializationEngineConfig(FeastConfigBaseModel): + """Batch Materialization Engine config for Bytewax""" + + type: Literal["bytewax"] = "bytewax" + """ Materialization type selector""" + + namespace: StrictStr = "default" + """ (optional) The namespace in Kubernetes to use when creating services, configuration maps and jobs. + """ + + image: StrictStr = "bytewax/bytewax-feast:latest" + """ (optional) The container image to use when running the materialization job.""" + + env: List[dict] = [] + """ (optional) A list of environment variables to set in the created Kubernetes pods. + These environment variables can be used to reference Kubernetes secrets. 
+ """ + + +class BytewaxMaterializationEngine(BatchMaterializationEngine): + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + super().__init__( + repo_config=repo_config, + offline_store=offline_store, + online_store=online_store, + **kwargs, + ) + self.repo_config = repo_config + self.offline_store = offline_store + self.online_store = online_store + + # TODO: Configure k8s here + k8s_config.load_kube_config() + + self.k8s_client = client.api_client.ApiClient() + self.v1 = client.CoreV1Api(self.k8s_client) + self.batch_v1 = client.BatchV1Api(self.k8s_client) + self.batch_engine_config = repo_config.batch_engine + self.namespace = self.batch_engine_config.namespace + + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + """This method ensures that any necessary infrastructure or resources needed by the + engine are set up ahead of materialization.""" + pass + + def teardown_infra( + self, + project: str, + fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], + entities: Sequence[Entity], + ): + """This method ensures that any infrastructure or resources set up by ``update()``are torn down.""" + pass + + def materialize( + self, + registry: BaseRegistry, + tasks: List[MaterializationTask], + ) -> List[MaterializationJob]: + return [ + self._materialize_one( + registry, + task.feature_view, + task.start_time, + task.end_time, + task.project, + task.tqdm_builder, + ) + for task in tasks + ] + + def _materialize_one( + self, + registry: BaseRegistry, + feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView], + start_date: datetime, + end_date: datetime, + project: str, + tqdm_builder: Callable[[int], tqdm], 
+ ): + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + timestamp_field, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=self.repo_config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + paths = offline_job.to_remote_storage() + job_id = str(uuid.uuid4()) + return self._create_kubernetes_job(job_id, paths, feature_view) + + def _create_kubernetes_job(self, job_id, paths, feature_view): + try: + # Create a k8s configmap with information needed by bytewax + self._create_configuration_map(job_id, paths, feature_view, self.namespace) + + # Create the k8s service definition, used for bytewax communication + self._create_service_definition(job_id, self.namespace) + + # Create the k8s job definition + self._create_job_definition( + job_id, + self.namespace, + len(paths), # Create a pod for each parquet file + self.batch_engine_config.env, + ) + except FailToCreateError as failures: + return BytewaxMaterializationJob(job_id, self.namespace, error=failures) + + return BytewaxMaterializationJob(job_id, self.namespace) + + def _create_configuration_map(self, job_id, paths, feature_view, namespace): + """Create a Kubernetes configmap for this job""" + + feature_store_configuration = yaml.dump( + yaml.safe_load( + self.repo_config.json( + exclude={"repo_path"}, + exclude_unset=True, + ) + ) + ) + + materialization_config = yaml.dump( + {"paths": paths, "feature_view": feature_view.name} + ) + + configmap_manifest = { + "kind": "ConfigMap", + "apiVersion": "v1", + "metadata": { + "name": f"feast-{job_id}", + }, + "data": { + 
"feature_store.yaml": feature_store_configuration, + "bytewax_materialization_config.yaml": materialization_config, + }, + } + self.v1.create_namespaced_config_map( + namespace=namespace, + body=configmap_manifest, + ) + + def _create_service_definition(self, job_id, namespace): + """Creates a kubernetes service definition. + + This service definition is created to allow bytewax workers + to communicate with each other. + """ + service_definition = { + "apiVersion": "v1", + "kind": "Service", + "metadata": { + "name": f"dataflow-{job_id}", + "namespace": namespace, + }, + "spec": { + "clusterIP": "None", + "clusterIPs": ["None"], + "internalTrafficPolicy": "Cluster", + "ipFamilies": ["IPv4"], + "ipFamilyPolicy": "SingleStack", + "ports": [ + { + "name": "worker", + "port": 9999, + "protocol": "TCP", + "targetPort": 9999, + } + ], + "selector": {"job-name": f"dataflow-{job_id}"}, + "sessionAffinity": "None", + "type": "ClusterIP", + }, + } + + utils.create_from_dict(self.k8s_client, service_definition) + + def _create_job_definition(self, job_id, namespace, pods, env): + """Create a kubernetes job definition.""" + job_env = [ + {"name": "RUST_BACKTRACE", "value": "full"}, + { + "name": "BYTEWAX_PYTHON_FILE_PATH", + "value": "/bytewax/dataflow.py", + }, + {"name": "BYTEWAX_WORKDIR", "value": "/bytewax"}, + { + "name": "BYTEWAX_WORKERS_PER_PROCESS", + "value": "1", + }, + { + "name": "BYTEWAX_POD_NAME", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "metadata.annotations['batch.kubernetes.io/job-completion-index']", + } + }, + }, + { + "name": "BYTEWAX_REPLICAS", + "value": f"{pods}", + }, + { + "name": "BYTEWAX_KEEP_CONTAINER_ALIVE", + "value": "false", + }, + { + "name": "BYTEWAX_HOSTFILE_PATH", + "value": "/etc/bytewax/hostfile.txt", + }, + { + "name": "BYTEWAX_STATEFULSET_NAME", + "value": f"dataflow-{job_id}", + }, + ] + # Add any Feast configured environment variables + job_env.extend(env) + + job_definition = { + "apiVersion": "batch/v1", 
+ "kind": "Job", + "metadata": { + "name": f"dataflow-{job_id}", + "namespace": namespace, + }, + "spec": { + "ttlSecondsAfterFinished": 3600, + "completions": pods, + "parallelism": pods, + "completionMode": "Indexed", + "template": { + "spec": { + "restartPolicy": "Never", + "subdomain": f"dataflow-{job_id}", + "initContainers": [ + { + "command": [ + "sh", + "-c", + f'set -ex\n# Generate hostfile.txt.\necho "dataflow-{job_id}-0.dataflow-{job_id}.{namespace}.svc.cluster.local:9999" > /etc/bytewax/hostfile.txt\nreplicas=$(($BYTEWAX_REPLICAS-1))\nx=1\nwhile [ $x -le $replicas ]\ndo\n echo "dataflow-{job_id}-$x.dataflow-{job_id}.{namespace}.svc.cluster.local:9999" >> /etc/bytewax/hostfile.txt\n x=$(( $x + 1 ))\ndone', + ], + "env": [ + { + "name": "BYTEWAX_REPLICAS", + "value": f"{pods}", + } + ], + "image": "busybox", + "imagePullPolicy": "Always", + "name": "init-hostfile", + "resources": {}, + "securityContext": { + "allowPrivilegeEscalation": False, + "capabilities": { + "add": ["NET_BIND_SERVICE"], + "drop": ["ALL"], + }, + "readOnlyRootFilesystem": True, + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + {"mountPath": "/etc/bytewax", "name": "hostfile"}, + { + "mountPath": "/tmp/bytewax/", + "name": "python-files", + }, + { + "mountPath": "/var/feast/", + "name": f"feast-{job_id}", + }, + ], + } + ], + "containers": [ + { + "command": ["sh", "-c", "sh ./entrypoint.sh"], + "env": job_env, + "image": self.batch_engine_config.image, + "imagePullPolicy": "Always", + "name": "process", + "ports": [ + { + "containerPort": 9999, + "name": "process", + "protocol": "TCP", + } + ], + "resources": {}, + "securityContext": { + "allowPrivilegeEscalation": False, + "capabilities": { + "add": ["NET_BIND_SERVICE"], + "drop": ["ALL"], + }, + "readOnlyRootFilesystem": False, + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + {"mountPath": 
"/etc/bytewax", "name": "hostfile"}, + { + "mountPath": "/var/feast/", + "name": f"feast-{job_id}", + }, + ], + } + ], + "volumes": [ + {"emptyDir": {}, "name": "hostfile"}, + { + "configMap": { + "defaultMode": 420, + "name": f"feast-{job_id}", + }, + "name": "python-files", + }, + { + "configMap": {"name": f"feast-{job_id}"}, + "name": f"feast-{job_id}", + }, + ], + } + }, + }, + } + utils.create_from_dict(self.k8s_client, job_definition) diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_job.py b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_job.py new file mode 100644 index 0000000000..77d2149eb5 --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_job.py @@ -0,0 +1,51 @@ +from typing import Optional + +from kubernetes import client + +from feast.infra.materialization.batch_materialization_engine import ( + MaterializationJob, + MaterializationJobStatus, +) + + +class BytewaxMaterializationJob(MaterializationJob): + def __init__( + self, + job_id, + namespace, + error: Optional[BaseException] = None, + ): + super().__init__() + self._job_id = job_id + self.namespace = namespace + self._error: Optional[BaseException] = error + self.batch_v1 = client.BatchV1Api() + + def error(self): + return self._error + + def status(self): + if self._error is not None: + return MaterializationJobStatus.ERROR + else: + # TODO: Find a better way to parse status? 
+ job_status = self.batch_v1.read_namespaced_job_status( + self.job_id(), self.namespace + ).status + if job_status.active is not None: + if job_status.completion_time is None: + return MaterializationJobStatus.RUNNING + elif job_status.failed is not None: + return MaterializationJobStatus.ERROR + elif job_status.active is None and job_status.succeeded is not None: + if job_status.conditions[0].type == "Complete": + return MaterializationJobStatus.SUCCEEDED + + def should_be_retried(self): + return False + + def job_id(self): + return f"dataflow-{self._job_id}" + + def url(self): + return None diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_task.py b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_task.py new file mode 100644 index 0000000000..8bb8da741a --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_task.py @@ -0,0 +1,10 @@ +from feast.infra.materialization.batch_materialization_engine import MaterializationTask + + +class BytewaxMaterializationTask(MaterializationTask): + def __init__(self, project, feature_view, start_date, end_date, tqdm): + self.project = project + self.feature_view = feature_view + self.start_date = start_date + self.end_date = end_date + self.tqdm = tqdm diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py b/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py new file mode 100644 index 0000000000..e3d95e2a75 --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py @@ -0,0 +1,22 @@ +import yaml + +from feast import FeatureStore, RepoConfig +from feast.infra.materialization.contrib.bytewax.bytewax_materialization_dataflow import ( + BytewaxMaterializationDataflow, +) + +if __name__ == "__main__": + with open("/var/feast/feature_store.yaml") as f: + feast_config = yaml.safe_load(f) + + with open("/var/feast/bytewax_materialization_config.yaml") as 
b: + bytewax_config = yaml.safe_load(b) + + config = RepoConfig(**feast_config) + store = FeatureStore(config=config) + + job = BytewaxMaterializationDataflow( + config, + store.get_feature_view(bytewax_config["feature_view"]), + bytewax_config["paths"], + ) diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh b/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh new file mode 100644 index 0000000000..0179e5481f --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd /bytewax +python dataflow.py diff --git a/sdk/python/feast/infra/materialization/lambda/__init__.py b/sdk/python/feast/infra/materialization/lambda/__init__.py deleted file mode 100644 index d21505d91e..0000000000 --- a/sdk/python/feast/infra/materialization/lambda/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .lambda_engine import ( - LambdaMaterializationEngine, - LambdaMaterializationEngineConfig, - LambdaMaterializationJob, -) - -__all__ = [ - "LambdaMaterializationEngineConfig", - "LambdaMaterializationJob", - "LambdaMaterializationEngine", -] diff --git a/sdk/python/feast/infra/materialization/local_engine.py b/sdk/python/feast/infra/materialization/local_engine.py index 4f775981ef..d818571453 100644 --- a/sdk/python/feast/infra/materialization/local_engine.py +++ b/sdk/python/feast/infra/materialization/local_engine.py @@ -9,15 +9,15 @@ from feast.feature_view import FeatureView from feast.infra.offline_stores.offline_store import OfflineStore from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.stream_feature_view import StreamFeatureView - -from ...registry import BaseRegistry -from ...utils import ( +from feast.utils import ( _convert_arrow_to_proto, _get_column_names, _run_pyarrow_field_mapping, ) + from .batch_materialization_engine 
import ( BatchMaterializationEngine, MaterializationJob, diff --git a/sdk/python/feast/infra/materialization/snowflake_engine.py b/sdk/python/feast/infra/materialization/snowflake_engine.py new file mode 100644 index 0000000000..1663cbcbc0 --- /dev/null +++ b/sdk/python/feast/infra/materialization/snowflake_engine.py @@ -0,0 +1,485 @@ +import os +import shutil +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Callable, List, Literal, Optional, Sequence, Union + +import click +import pandas as pd +from colorama import Fore, Style +from pydantic import Field, StrictStr +from tqdm import tqdm + +import feast +from feast.batch_feature_view import BatchFeatureView +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.infra.materialization.batch_materialization_engine import ( + BatchMaterializationEngine, + MaterializationJob, + MaterializationJobStatus, + MaterializationTask, +) +from feast.infra.offline_stores.offline_store import OfflineStore +from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.utils.snowflake.snowflake_utils import ( + _run_snowflake_field_mapping, + assert_snowflake_feature_names, + execute_snowflake_statement, + get_snowflake_conn, + package_snowpark_zip, +) +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.stream_feature_view import StreamFeatureView +from feast.type_map import _convert_value_name_to_snowflake_udf +from feast.utils import _coerce_datetime, _get_column_names + + +class SnowflakeMaterializationEngineConfig(FeastConfigBaseModel): + """Batch Materialization Engine config for Snowflake Snowpark Python UDFs""" + + type: Literal["snowflake.engine"] = "snowflake.engine" + """ Type 
selector""" + + config_path: Optional[str] = ( + Path(os.environ["HOME"]) / ".snowsql/config" + ).__str__() + """ Snowflake config path -- absolute path required (Cant use ~)""" + + account: Optional[str] = None + """ Snowflake deployment identifier -- drop .snowflakecomputing.com""" + + user: Optional[str] = None + """ Snowflake user name """ + + password: Optional[str] = None + """ Snowflake password """ + + role: Optional[str] = None + """ Snowflake role name""" + + warehouse: Optional[str] = None + """ Snowflake warehouse name """ + + authenticator: Optional[str] = None + """ Snowflake authenticator name """ + + database: StrictStr + """ Snowflake database name """ + + schema_: Optional[str] = Field("PUBLIC", alias="schema") + """ Snowflake schema name """ + + class Config: + allow_population_by_field_name = True + + +@dataclass +class SnowflakeMaterializationJob(MaterializationJob): + def __init__( + self, + job_id: str, + status: MaterializationJobStatus, + error: Optional[BaseException] = None, + ) -> None: + super().__init__() + self._job_id: str = job_id + self._status: MaterializationJobStatus = status + self._error: Optional[BaseException] = error + + def status(self) -> MaterializationJobStatus: + return self._status + + def error(self) -> Optional[BaseException]: + return self._error + + def should_be_retried(self) -> bool: + return False + + def job_id(self) -> str: + return self._job_id + + def url(self) -> Optional[str]: + return None + + +class SnowflakeMaterializationEngine(BatchMaterializationEngine): + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + click.echo( + f"Deploying materialization functions for {Style.BRIGHT + Fore.GREEN}{project}{Style.RESET_ALL}" + ) + click.echo() + + stage_context 
= f'"{self.repo_config.batch_engine.database}"."{self.repo_config.batch_engine.schema_}"' + stage_path = f'{stage_context}."feast_{project}"' + with get_snowflake_conn(self.repo_config.batch_engine) as conn: + query = f"SHOW STAGES IN {stage_context}" + cursor = execute_snowflake_statement(conn, query) + stage_list = pd.DataFrame( + cursor.fetchall(), + columns=[column.name for column in cursor.description], + ) + + # if the stage already exists, + # assumes that the materialization functions have been deployed + if f"feast_{project}" in stage_list["name"].tolist(): + click.echo( + f"Materialization functions for {Style.BRIGHT + Fore.GREEN}{project}{Style.RESET_ALL} already exists" + ) + click.echo() + return None + + query = f"CREATE STAGE {stage_path}" + execute_snowflake_statement(conn, query) + + copy_path, zip_path = package_snowpark_zip(project) + query = f"PUT file://{zip_path} @{stage_path}" + execute_snowflake_statement(conn, query) + + shutil.rmtree(copy_path) + + # Execute snowflake python udf creation functions + sql_function_file = f"{os.path.dirname(feast.__file__)}/infra/utils/snowflake/snowpark/snowflake_python_udfs_creation.sql" + with open(sql_function_file, "r") as file: + sqlFile = file.read() + + sqlCommands = sqlFile.split(";") + for command in sqlCommands: + command = command.replace("STAGE_HOLDER", f"{stage_path}") + query = command.replace("PROJECT_NAME", f"{project}") + execute_snowflake_statement(conn, query) + + return None + + def teardown_infra( + self, + project: str, + fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], + entities: Sequence[Entity], + ): + + stage_path = f'"{self.repo_config.batch_engine.database}"."{self.repo_config.batch_engine.schema_}"."feast_{project}"' + with get_snowflake_conn(self.repo_config.batch_engine) as conn: + query = f"DROP STAGE IF EXISTS {stage_path}" + execute_snowflake_statement(conn, query) + + # Execute snowflake python udf deletion functions + sql_function_file = 
f"{os.path.dirname(feast.__file__)}/infra/utils/snowflake/snowpark/snowflake_python_udfs_deletion.sql" + with open(sql_function_file, "r") as file: + sqlFile = file.read() + + sqlCommands = sqlFile.split(";") + for command in sqlCommands: + query = command.replace("PROJECT_NAME", f"{project}") + execute_snowflake_statement(conn, query) + + return None + + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + assert ( + repo_config.offline_store.type == "snowflake.offline" + ), "To use SnowflakeMaterializationEngine, you must use Snowflake as an offline store." + + super().__init__( + repo_config=repo_config, + offline_store=offline_store, + online_store=online_store, + **kwargs, + ) + + def materialize( + self, registry, tasks: List[MaterializationTask] + ) -> List[MaterializationJob]: + return [ + self._materialize_one( + registry, + task.feature_view, + task.start_time, + task.end_time, + task.project, + task.tqdm_builder, + ) + for task in tasks + ] + + def _materialize_one( + self, + registry: BaseRegistry, + feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView], + start_date: datetime, + end_date: datetime, + project: str, + tqdm_builder: Callable[[int], tqdm], + ): + assert isinstance(feature_view, BatchFeatureView) or isinstance( + feature_view, FeatureView + ), "Snowflake can only materialize FeatureView & BatchFeatureView feature view types." 
+ + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + timestamp_field, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + job_id = f"{feature_view.name}-{start_date}-{end_date}" + + try: + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=self.repo_config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + fv_latest_values_sql = offline_job.to_sql() + + if feature_view.batch_source.field_mapping is not None: + fv_latest_mapped_values_sql = _run_snowflake_field_mapping( + fv_latest_values_sql, feature_view.batch_source.field_mapping + ) + + fv_to_proto_sql = self.generate_snowflake_materialization_query( + self.repo_config, + fv_latest_mapped_values_sql, + feature_view, + project, + ) + + if self.repo_config.online_store.type == "snowflake.online": + self.materialize_to_snowflake_online_store( + self.repo_config, + fv_to_proto_sql, + feature_view, + project, + ) + else: + self.materialize_to_external_online_store( + self.repo_config, + fv_to_proto_sql, + feature_view, + tqdm_builder, + ) + + return SnowflakeMaterializationJob( + job_id=job_id, status=MaterializationJobStatus.SUCCEEDED + ) + except BaseException as e: + return SnowflakeMaterializationJob( + job_id=job_id, status=MaterializationJobStatus.ERROR, error=e + ) + + def generate_snowflake_materialization_query( + self, + repo_config: RepoConfig, + fv_latest_mapped_values_sql: str, + feature_view: Union[BatchFeatureView, FeatureView], + project: str, + ) -> str: + + if feature_view.batch_source.created_timestamp_column: + fv_created_str = f',"{feature_view.batch_source.created_timestamp_column}"' + else: + fv_created_str = 
None + + join_keys = [entity.name for entity in feature_view.entity_columns] + join_keys_type = [ + entity.dtype.to_value_type().name for entity in feature_view.entity_columns + ] + + entity_names = "ARRAY_CONSTRUCT('" + "', '".join(join_keys) + "')" + entity_data = 'ARRAY_CONSTRUCT("' + '", "'.join(join_keys) + '")' + entity_types = "ARRAY_CONSTRUCT('" + "', '".join(join_keys_type) + "')" + + """ + Generate the SQL that maps the feature given ValueType to the correct python + UDF serialization function. + """ + feature_sql_list = [] + for feature in feature_view.features: + feature_value_type_name = feature.dtype.to_value_type().name + + feature_sql = _convert_value_name_to_snowflake_udf( + feature_value_type_name, project + ) + + if feature_value_type_name == "UNIX_TIMESTAMP": + feature_sql = f'{feature_sql}(DATE_PART(EPOCH_NANOSECOND, "{feature.name}")) AS "{feature.name}"' + else: + feature_sql = f'{feature_sql}("{feature.name}") AS "{feature.name}"' + + feature_sql_list.append(feature_sql) + + features_str = ",\n".join(feature_sql_list) + + if repo_config.online_store.type == "snowflake.online": + serial_func = f"feast_{project}_serialize_entity_keys" + else: + serial_func = f"feast_{project}_entity_key_proto_to_string" + + fv_to_proto_sql = f""" + SELECT + {serial_func.upper()}({entity_names}, {entity_data}, {entity_types}) AS "entity_key", + {features_str}, + "{feature_view.batch_source.timestamp_field}" + {fv_created_str if fv_created_str else ''} + FROM ( + {fv_latest_mapped_values_sql} + ) + """ + + return fv_to_proto_sql + + def materialize_to_snowflake_online_store( + self, + repo_config: RepoConfig, + materialization_sql: str, + feature_view: Union[BatchFeatureView, FeatureView], + project: str, + ) -> None: + assert_snowflake_feature_names(feature_view) + + online_table = f"""{repo_config .online_store.database}"."{repo_config.online_store.schema_}"."[online-transient] {project}_{feature_view.name}""" + + feature_names_str = '", "'.join( + 
[feature.name for feature in feature_view.features] + ) + + if feature_view.batch_source.created_timestamp_column: + fv_created_str = f',"{feature_view.batch_source.created_timestamp_column}"' + else: + fv_created_str = None + + query = f""" + MERGE INTO "{online_table}" online_table + USING ( + SELECT + "entity_key" || TO_BINARY("feature_name", 'UTF-8') AS "entity_feature_key", + "entity_key", + "feature_name", + "feature_value" AS "value", + "{feature_view.batch_source.timestamp_field}" AS "event_ts" + {fv_created_str + ' AS "created_ts"' if fv_created_str else ''} + FROM ( + {materialization_sql} + ) + UNPIVOT("feature_value" FOR "feature_name" IN ("{feature_names_str}")) + ) AS latest_values ON online_table."entity_feature_key" = latest_values."entity_feature_key" + WHEN MATCHED THEN + UPDATE SET + online_table."entity_key" = latest_values."entity_key", + online_table."feature_name" = latest_values."feature_name", + online_table."value" = latest_values."value", + online_table."event_ts" = latest_values."event_ts" + {',online_table."created_ts" = latest_values."created_ts"' if fv_created_str else ''} + WHEN NOT MATCHED THEN + INSERT ("entity_feature_key", "entity_key", "feature_name", "value", "event_ts" {', "created_ts"' if fv_created_str else ''}) + VALUES ( + latest_values."entity_feature_key", + latest_values."entity_key", + latest_values."feature_name", + latest_values."value", + latest_values."event_ts" + {',latest_values."created_ts"' if fv_created_str else ''} + ) + """ + + with get_snowflake_conn(repo_config.batch_engine) as conn: + query_id = execute_snowflake_statement(conn, query).sfqid + + click.echo( + f"Snowflake Query ID: {Style.BRIGHT + Fore.GREEN}{query_id}{Style.RESET_ALL}" + ) + return None + + def materialize_to_external_online_store( + self, + repo_config: RepoConfig, + materialization_sql: str, + feature_view: Union[StreamFeatureView, FeatureView], + tqdm_builder: Callable[[int], tqdm], + ) -> None: + + feature_names = [feature.name for 
feature in feature_view.features] + + with get_snowflake_conn(repo_config.batch_engine) as conn: + query = materialization_sql + cursor = execute_snowflake_statement(conn, query) + for i, df in enumerate(cursor.fetch_pandas_batches()): + click.echo( + f"Snowflake: Processing Materialization ResultSet Batch #{i+1}" + ) + + entity_keys = ( + df["entity_key"].apply(EntityKeyProto.FromString).to_numpy() + ) + + for feature in feature_names: + df[feature] = df[feature].apply(ValueProto.FromString) + + features = df[feature_names].to_dict("records") + + event_timestamps = [ + _coerce_datetime(val) + for val in pd.to_datetime( + df[feature_view.batch_source.timestamp_field] + ) + ] + + if feature_view.batch_source.created_timestamp_column: + created_timestamps = [ + _coerce_datetime(val) + for val in pd.to_datetime( + df[feature_view.batch_source.created_timestamp_column] + ) + ] + else: + created_timestamps = [None] * df.shape[0] + + rows_to_write = list( + zip( + entity_keys, + features, + event_timestamps, + created_timestamps, + ) + ) + + with tqdm_builder(len(rows_to_write)) as pbar: + self.online_store.online_write_batch( + repo_config, + feature_view, + rows_to_write, + lambda x: pbar.update(x), + ) + return None diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index e3791f08c7..8b2773fb65 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -39,12 +39,12 @@ RetrievalJob, RetrievalMetadata, ) +from feast.infra.registry.base_registry import BaseRegistry from feast.on_demand_feature_view import OnDemandFeatureView -from feast.registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from ...saved_dataset import SavedDatasetStorage -from ...usage import log_exceptions_and_usage +from ...usage import get_user_agent, log_exceptions_and_usage from .bigquery_source import ( BigQueryLoggingDestination, 
BigQuerySource, @@ -52,6 +52,7 @@ ) try: + from google.api_core import client_info as http_client_info from google.api_core.exceptions import NotFound from google.auth.exceptions import DefaultCredentialsError from google.cloud import bigquery @@ -65,6 +66,10 @@ raise FeastExtrasDependencyImportError("gcp", str(e)) +def get_http_client_info(): + return http_client_info.ClientInfo(user_agent=get_user_agent()) + + class BigQueryOfflineStoreConfig(FeastConfigBaseModel): """Offline store config for GCP BigQuery""" @@ -101,6 +106,7 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, BigQueryOfflineStoreConfig) assert isinstance(data_source, BigQuerySource) from_expression = data_source.get_table_query_string() @@ -151,6 +157,7 @@ def pull_all_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, BigQueryOfflineStoreConfig) assert isinstance(data_source, BigQuerySource) from_expression = data_source.get_table_query_string() @@ -186,6 +193,8 @@ def get_historical_features( ) -> RetrievalJob: # TODO: Add entity_df validation in order to fail before interacting with BigQuery assert isinstance(config.offline_store, BigQueryOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, BigQuerySource) client = _get_bigquery_client( project=config.offline_store.project_id, @@ -328,18 +337,8 @@ def offline_write_batch( table: pyarrow.Table, progress: Optional[Callable[[int], Any]], ): - if not feature_view.batch_source: - raise ValueError( - "feature view does not have a batch source to persist offline data" - ) - if not isinstance(config.offline_store, BigQueryOfflineStoreConfig): - raise ValueError( - f"offline store config is of type {type(config.offline_store)} when bigquery type required" - ) - if not isinstance(feature_view.batch_source, BigQuerySource): - raise ValueError( - f"feature view 
batch source is {type(feature_view.batch_source)} not bigquery source" - ) + assert isinstance(config.offline_store, BigQueryOfflineStoreConfig) + assert isinstance(feature_view.batch_source, BigQuerySource) pa_schema, column_names = offline_utils.get_pyarrow_schema_from_batch_source( config, feature_view.batch_source @@ -399,9 +398,7 @@ def query_generator() -> Iterator[str]: self.client = client self.config = config self._full_feature_names = full_feature_names - self._on_demand_feature_views = ( - on_demand_feature_views if on_demand_feature_views else [] - ) + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata if self.config.offline_store.gcs_staging_location: self._gcs_path = ( @@ -417,7 +414,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views def _to_df_internal(self) -> pd.DataFrame: @@ -426,29 +423,27 @@ def _to_df_internal(self) -> pd.DataFrame: return df def to_sql(self) -> str: - """ - Returns the SQL query that will be executed in BigQuery to build the historical feature table. - """ + """Returns the underlying SQL query.""" with self._query_generator() as query: return query def to_bigquery( self, - job_config: bigquery.QueryJobConfig = None, + job_config: Optional[bigquery.QueryJobConfig] = None, timeout: int = 1800, retry_cadence: int = 10, ) -> str: """ - Triggers the execution of a historical feature retrieval query and exports the results to a BigQuery table. - Runs for a maximum amount of time specified by the timeout parameter (defaulting to 30 minutes). + Synchronously executes the underlying query and exports the result to a BigQuery table. The + underlying BigQuery job runs for a limited amount of time (the default is 30 minutes). 
Args: - job_config: An optional bigquery.QueryJobConfig to specify options like destination table, dry run, etc. - timeout: An optional number of seconds for setting the time limit of the QueryJob. - retry_cadence: An optional number of seconds for setting how long the job should checked for completion. + job_config (optional): A bigquery.QueryJobConfig to specify options like the destination table, dry run, etc. + timeout (optional): The time limit of the BigQuery job in seconds. Defaults to 30 minutes. + retry_cadence (optional): The number of seconds for setting how long the job should checked for completion. Returns: - Returns the destination table name or returns None if job_config.dry_run is True. + Returns the destination table name or None if job_config.dry_run is True. """ if not job_config: @@ -492,7 +487,7 @@ def _execute_query( block_until_done(client=self.client, bq_job=bq_job, timeout=timeout) return bq_job - def persist(self, storage: SavedDatasetStorage): + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): assert isinstance(storage, SavedDatasetBigQueryStorage) self.to_bigquery( @@ -692,7 +687,9 @@ def _get_bigquery_client( project: Optional[str] = None, location: Optional[str] = None ) -> bigquery.Client: try: - client = bigquery.Client(project=project, location=location) + client = bigquery.Client( + project=project, location=location, client_info=get_http_client_info() + ) except DefaultCredentialsError as e: raise FeastProviderLoginError( str(e) diff --git a/sdk/python/feast/infra/offline_stores/bigquery_source.py b/sdk/python/feast/infra/offline_stores/bigquery_source.py index bb8316869b..a9c7924c66 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery_source.py +++ b/sdk/python/feast/infra/offline_stores/bigquery_source.py @@ -1,11 +1,10 @@ -import warnings from typing import Callable, Dict, Iterable, List, Optional, Tuple from typeguard import typechecked from feast import type_map from feast.data_source 
import DataSource -from feast.errors import DataSourceNotFoundException +from feast.errors import DataSourceNoNameException, DataSourceNotFoundException from feast.feature_logging import LoggingDestination from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.FeatureService_pb2 import ( @@ -24,36 +23,35 @@ class BigQuerySource(DataSource): def __init__( self, *, - event_timestamp_column: Optional[str] = "", + name: Optional[str] = None, + timestamp_field: Optional[str] = None, table: Optional[str] = None, created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = None, query: Optional[str] = None, - name: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", - timestamp_field: Optional[str] = None, ): """Create a BigQuerySource from an existing table or query. Args: + name (optional): Name for the source. Defaults to the table if not specified, in which + case the table must be specified. + timestamp_field (optional): Event timestamp field used for point in time + joins of feature values. + table (optional): BigQuery table where the features are stored. Exactly one of 'table' + and 'query' must be specified. table (optional): The BigQuery table where features can be found. - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. created_timestamp_column (optional): Timestamp column when row was created, used for deduplicating rows. - field_mapping: A dictionary mapping of column names in this data source to feature names in a feature table + field_mapping (optional): A dictionary mapping of column names in this data source to feature names in a feature table or view. Only used for feature columns, not entities or timestamp columns. 
- date_partition_column (deprecated): Timestamp column used for partitioning. - query (optional): SQL query to execute to generate data for this data source. - name (optional): Name for the source. Defaults to the table if not specified. + query (optional): The query to be executed to obtain the features. Exactly one of 'table' + and 'query' must be specified. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the bigquery source, typically the email of the primary maintainer. - timestamp_field (optional): Event timestamp field used for point in time - joins of feature values. Example: >>> from feast import BigQuerySource >>> my_bigquery_source = BigQuerySource(table="gcp_project:bq_dataset.bq_table") @@ -63,37 +61,20 @@ def __init__( self.bigquery_options = BigQueryOptions(table=table, query=query) - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is not supported for BigQuery sources. " - "It will be removed in Feast 0.24+" - ), - DeprecationWarning, - ) - - # If no name, use the table as the default name - _name = name - if not _name: - if table: - _name = table - else: - warnings.warn( - ( - f"Starting in Feast 0.24, Feast will require either a name for a data source (if using query) or `table`: {self.query}" - ), - DeprecationWarning, - ) + # If no name, use the table as the default name. 
+ if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name super().__init__( - name=_name if _name else "", - event_timestamp_column=event_timestamp_column, + name=name, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) # Note: Python requires redefining hash in child classes that override __eq__ diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena.py b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena.py new file mode 100644 index 0000000000..5095a43d57 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena.py @@ -0,0 +1,713 @@ +import contextlib +import uuid +from datetime import datetime +from pathlib import Path +from typing import ( + Callable, + ContextManager, + Dict, + Iterator, + List, + Optional, + Tuple, + Union, +) + +import numpy as np +import pandas as pd +import pyarrow +import pyarrow as pa +from pydantic import StrictStr +from pydantic.typing import Literal +from pytz import utc + +from feast import OnDemandFeatureView +from feast.data_source import DataSource +from feast.errors import InvalidEntityType +from feast.feature_logging import LoggingConfig, LoggingSource +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.contrib.athena_offline_store.athena_source import ( + AthenaLoggingDestination, + AthenaSource, + SavedDatasetAthenaStorage, +) +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + 
RetrievalJob, + RetrievalMetadata, +) +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.registry.registry import Registry +from feast.infra.utils import aws_utils +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.usage import log_exceptions_and_usage + + +class AthenaOfflineStoreConfig(FeastConfigBaseModel): + """Offline store config for AWS Athena""" + + type: Literal["athena"] = "athena" + """ Offline store type selector""" + + data_source: StrictStr + """ athena data source ex) AwsDataCatalog """ + + region: StrictStr + """ Athena's AWS region """ + + database: StrictStr + """ Athena database name """ + + s3_staging_location: StrictStr + """ S3 path for importing & exporting data to Athena """ + + +class AthenaOfflineStore(OfflineStore): + @staticmethod + @log_exceptions_and_usage(offline_store="athena") + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert isinstance(config.offline_store, AthenaOfflineStoreConfig) + assert isinstance(data_source, AthenaSource) + + from_expression = data_source.get_table_query_string(config) + + partition_by_join_key_string = ", ".join(join_key_columns) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamp_columns = [timestamp_field] + if created_timestamp_column: + timestamp_columns.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(timestamp_columns) + " DESC" + field_string = ", ".join( + join_key_columns + feature_name_columns + timestamp_columns + ) + + date_partition_column = data_source.date_partition_column + + athena_client = 
aws_utils.get_athena_data_client(config.offline_store.region) + s3_resource = aws_utils.get_s3_resource(config.offline_store.region) + + start_date = start_date.astimezone(tz=utc) + end_date = end_date.astimezone(tz=utc) + + query = f""" + SELECT + {field_string} + {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row + FROM {from_expression} + WHERE {timestamp_field} BETWEEN TIMESTAMP '{start_date.strftime('%Y-%m-%d %H:%M:%S')}' AND TIMESTAMP '{end_date.strftime('%Y-%m-%d %H:%M:%S')}' + {"AND "+date_partition_column+" >= '"+start_date.strftime('%Y-%m-%d')+"' AND "+date_partition_column+" <= '"+end_date.strftime('%Y-%m-%d')+"' " if date_partition_column != "" and date_partition_column is not None else ''} + ) + WHERE _feast_row = 1 + """ + # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized + return AthenaRetrievalJob( + query=query, + athena_client=athena_client, + s3_resource=s3_resource, + config=config, + full_feature_names=False, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="athena") + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert isinstance(config.offline_store, AthenaOfflineStoreConfig) + assert isinstance(data_source, AthenaSource) + from_expression = data_source.get_table_query_string(config) + + field_string = ", ".join( + join_key_columns + feature_name_columns + [timestamp_field] + ) + + athena_client = aws_utils.get_athena_data_client(config.offline_store.region) + s3_resource = aws_utils.get_s3_resource(config.offline_store.region) + + date_partition_column = data_source.date_partition_column + + query = 
f""" + SELECT {field_string} + FROM {from_expression} + WHERE {timestamp_field} BETWEEN TIMESTAMP '{start_date.astimezone(tz=utc).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]}' AND TIMESTAMP '{end_date.astimezone(tz=utc).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]}' + {"AND "+date_partition_column+" >= '"+start_date.strftime('%Y-%m-%d')+"' AND "+date_partition_column+" <= '"+end_date.strftime('%Y-%m-%d')+"' " if date_partition_column != "" and date_partition_column is not None else ''} + """ + + return AthenaRetrievalJob( + query=query, + athena_client=athena_client, + s3_resource=s3_resource, + config=config, + full_feature_names=False, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="athena") + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: Registry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + assert isinstance(config.offline_store, AthenaOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, AthenaSource) + + athena_client = aws_utils.get_athena_data_client(config.offline_store.region) + s3_resource = aws_utils.get_s3_resource(config.offline_store.region) + + # get pandas dataframe consisting of 1 row (LIMIT 1) and generate the schema out of it + entity_schema = _get_entity_schema( + entity_df, athena_client, config, s3_resource + ) + + # find timestamp column of entity df.(default = "event_timestamp"). Exception occurs if there are more than two timestamp columns. + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) + ) + + # get min,max of event_timestamp. 
+ entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, + entity_df_event_timestamp_col, + athena_client, + config, + ) + + @contextlib.contextmanager + def query_generator() -> Iterator[str]: + + table_name = offline_utils.get_temp_entity_table_name() + + _upload_entity_df(entity_df, athena_client, config, s3_resource, table_name) + + expected_join_keys = offline_utils.get_expected_join_keys( + project, feature_views, registry + ) + + offline_utils.assert_expected_columns_in_entity_df( + entity_schema, expected_join_keys, entity_df_event_timestamp_col + ) + + # Build a query context containing all information required to template the Athena SQL query + query_context = offline_utils.get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_event_timestamp_range, + ) + + # Generate the Athena SQL query from the query context + query = offline_utils.build_point_in_time_query( + query_context, + left_table_query_string=table_name, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + entity_df_columns=entity_schema.keys(), + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + full_feature_names=full_feature_names, + ) + + try: + yield query + finally: + + # Always clean up the temp Athena table + aws_utils.execute_athena_query( + athena_client, + config.offline_store.data_source, + config.offline_store.database, + f"DROP TABLE IF EXISTS {config.offline_store.database}.{table_name}", + ) + + bucket = config.offline_store.s3_staging_location.replace( + "s3://", "" + ).split("/", 1)[0] + aws_utils.delete_s3_directory( + s3_resource, bucket, "entity_df/" + table_name + "/" + ) + + return AthenaRetrievalJob( + query=query_generator, + athena_client=athena_client, + s3_resource=s3_resource, + config=config, + full_feature_names=full_feature_names, + on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( + feature_refs, project, registry + ), + metadata=RetrievalMetadata( + 
features=feature_refs, + keys=list(entity_schema.keys() - {entity_df_event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + max_event_timestamp=entity_df_event_timestamp_range[1], + ), + ) + + @staticmethod + def write_logged_features( + config: RepoConfig, + data: Union[pyarrow.Table, Path], + source: LoggingSource, + logging_config: LoggingConfig, + registry: BaseRegistry, + ): + destination = logging_config.destination + assert isinstance(destination, AthenaLoggingDestination) + + athena_client = aws_utils.get_athena_data_client(config.offline_store.region) + s3_resource = aws_utils.get_s3_resource(config.offline_store.region) + if isinstance(data, Path): + s3_path = f"{config.offline_store.s3_staging_location}/logged_features/{uuid.uuid4()}" + else: + s3_path = f"{config.offline_store.s3_staging_location}/logged_features/{uuid.uuid4()}.parquet" + + aws_utils.upload_arrow_table_to_athena( + table=data, + athena_client=athena_client, + data_source=config.offline_store.data_source, + database=config.offline_store.database, + s3_resource=s3_resource, + s3_path=s3_path, + table_name=destination.table_name, + schema=source.get_schema(registry), + fail_if_exists=False, + ) + + +class AthenaRetrievalJob(RetrievalJob): + def __init__( + self, + query: Union[str, Callable[[], ContextManager[str]]], + athena_client, + s3_resource, + config: RepoConfig, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, + metadata: Optional[RetrievalMetadata] = None, + ): + """Initialize AthenaRetrievalJob object. + + Args: + query: Athena SQL query to execute. Either a string, or a generator function that handles the artifact cleanup. 
+ athena_client: boto3 athena client + s3_resource: boto3 s3 resource object + config: Feast repo config + full_feature_names: Whether to add the feature view prefixes to the feature names + on_demand_feature_views (optional): A list of on demand transforms to apply at retrieval time + """ + + if not isinstance(query, str): + self._query_generator = query + else: + + @contextlib.contextmanager + def query_generator() -> Iterator[str]: + assert isinstance(query, str) + yield query + + self._query_generator = query_generator + self._athena_client = athena_client + self._s3_resource = s3_resource + self._config = config + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views or [] + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: + return self._on_demand_feature_views + + def get_temp_s3_path(self) -> str: + return ( + self._config.offline_store.s3_staging_location + + "/unload/" + + str(uuid.uuid4()) + ) + + def get_temp_table_dml_header( + self, temp_table_name: str, temp_external_location: str + ) -> str: + temp_table_dml_header = f""" + CREATE TABLE {temp_table_name} + WITH ( + external_location = '{temp_external_location}', + format = 'parquet', + write_compression = 'snappy' + ) + as + """ + return temp_table_dml_header + + @log_exceptions_and_usage + def _to_df_internal(self) -> pd.DataFrame: + with self._query_generator() as query: + temp_table_name = "_" + str(uuid.uuid4()).replace("-", "") + temp_external_location = self.get_temp_s3_path() + return aws_utils.unload_athena_query_to_df( + self._athena_client, + self._config.offline_store.data_source, + self._config.offline_store.database, + self._s3_resource, + temp_external_location, + self.get_temp_table_dml_header(temp_table_name, temp_external_location) + + query, + temp_table_name, + ) + + 
@log_exceptions_and_usage + def _to_arrow_internal(self) -> pa.Table: + with self._query_generator() as query: + temp_table_name = "_" + str(uuid.uuid4()).replace("-", "") + temp_external_location = self.get_temp_s3_path() + return aws_utils.unload_athena_query_to_pa( + self._athena_client, + self._config.offline_store.data_source, + self._config.offline_store.database, + self._s3_resource, + temp_external_location, + self.get_temp_table_dml_header(temp_table_name, temp_external_location) + + query, + temp_table_name, + ) + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + return self._metadata + + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): + assert isinstance(storage, SavedDatasetAthenaStorage) + self.to_athena(table_name=storage.athena_options.table) + + @log_exceptions_and_usage + def to_athena(self, table_name: str) -> None: + + if self.on_demand_feature_views: + transformed_df = self.to_df() + + _upload_entity_df( + transformed_df, + self._athena_client, + self._config, + self._s3_resource, + table_name, + ) + + return + + with self._query_generator() as query: + query = f'CREATE TABLE "{table_name}" AS ({query});\n' + + aws_utils.execute_athena_query( + self._athena_client, + self._config.offline_store.data_source, + self._config.offline_store.database, + query, + ) + + +def _upload_entity_df( + entity_df: Union[pd.DataFrame, str], + athena_client, + config: RepoConfig, + s3_resource, + table_name: str, +): + if isinstance(entity_df, pd.DataFrame): + # If the entity_df is a pandas dataframe, upload it to Athena + aws_utils.upload_df_to_athena( + athena_client, + config.offline_store.data_source, + config.offline_store.database, + s3_resource, + f"{config.offline_store.s3_staging_location}/entity_df/{table_name}/{table_name}.parquet", + table_name, + entity_df, + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), create a Athena table out of it + 
aws_utils.execute_athena_query( + athena_client, + config.offline_store.data_source, + config.offline_store.database, + f"CREATE TABLE {table_name} AS ({entity_df})", + ) + else: + raise InvalidEntityType(type(entity_df)) + + +def _get_entity_schema( + entity_df: Union[pd.DataFrame, str], + athena_client, + config: RepoConfig, + s3_resource, +) -> Dict[str, np.dtype]: + if isinstance(entity_df, pd.DataFrame): + return dict(zip(entity_df.columns, entity_df.dtypes)) + + elif isinstance(entity_df, str): + # get pandas dataframe consisting of 1 row (LIMIT 1) and generate the schema out of it + entity_df_sample = AthenaRetrievalJob( + f"SELECT * FROM ({entity_df}) LIMIT 1", + athena_client, + s3_resource, + config, + full_feature_names=False, + ).to_df() + return dict(zip(entity_df_sample.columns, entity_df_sample.dtypes)) + else: + raise InvalidEntityType(type(entity_df)) + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + athena_client, + config: RepoConfig, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pd.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pd.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pd.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + statement_id = aws_utils.execute_athena_query( + athena_client, + config.offline_store.data_source, + config.offline_store.database, + f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max " + f"FROM ({entity_df})", + ) + res = aws_utils.get_athena_query_result(athena_client, statement_id) + entity_df_event_timestamp_range = ( 
+ datetime.strptime( + res["Rows"][1]["Data"][0]["VarCharValue"], "%Y-%m-%d %H:%M:%S.%f" + ), + datetime.strptime( + res["Rows"][1]["Data"][1]["VarCharValue"], "%Y-%m-%d %H:%M:%S.%f" + ), + ) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """ +/* + Compute a deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +WITH entity_dataframe AS ( + SELECT *, + {{entity_df_event_timestamp_col}} AS entity_timestamp + {% for featureview in featureviews %} + {% if featureview.entities %} + ,( + {% for entity in featureview.entities %} + CAST({{entity}} as VARCHAR) || + {% endfor %} + CAST({{entity_df_event_timestamp_col}} AS VARCHAR) + ) AS {{featureview.name}}__entity_row_unique_id + {% else %} + ,CAST({{entity_df_event_timestamp_col}} AS VARCHAR) AS {{featureview.name}}__entity_row_unique_id + {% endif %} + {% endfor %} + FROM {{ left_table_query_string }} +), + +{% for featureview in featureviews %} + +{{ featureview.name }}__entity_dataframe AS ( + SELECT + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id + FROM entity_dataframe + GROUP BY + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id +), + +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + + 1. We first join the current feature_view to the entity dataframe that has been passed. 
+ This JOIN has the following logic:
+    - For each row of the entity dataframe, only keep the rows where the `timestamp_field`
+    is less than the one provided in the entity dataframe
+    - If there is a TTL for the current feature_view, also keep the rows where the `timestamp_field`
+    is higher than the one provided minus the TTL
+    - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been
+    computed previously
+
+ The output of this CTE will contain all the necessary information and already filtered out most
+ of the data that is not relevant.
+*/
+
+{{ featureview.name }}__subquery AS (
+    SELECT
+        {{ featureview.timestamp_field }} as event_timestamp,
+        {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }}
+        {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %}
+        {% for feature in featureview.features %}
+            {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %}
+        {% endfor %}
+    FROM {{ featureview.table_subquery }}
+    WHERE {{ featureview.timestamp_field }} <= from_iso8601_timestamp('{{ featureview.max_event_timestamp }}')
+    {% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
+    AND {{ featureview.date_partition_column }} <= '{{ featureview.max_event_timestamp[:10] }}'
+    {% endif %}
+
+    {% if featureview.ttl == 0 %}{% else %}
+    AND {{ featureview.timestamp_field }} >= from_iso8601_timestamp('{{ featureview.min_event_timestamp }}')
+    {% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
+    AND {{ featureview.date_partition_column }} >= '{{ featureview.min_event_timestamp[:10] }}'
+    {% endif %}
+    {% endif %}
+
+),
+
+{{ featureview.name }}__base AS (
+    SELECT
+        subquery.*,
+        entity_dataframe.entity_timestamp,
+        entity_dataframe.{{featureview.name}}__entity_row_unique_id
+    FROM {{ featureview.name }}__subquery AS subquery
+    INNER JOIN {{ featureview.name }}__entity_dataframe AS entity_dataframe
+    ON TRUE
+        AND subquery.event_timestamp <= entity_dataframe.entity_timestamp
+
+        {% if featureview.ttl == 0 %}{% else %}
+        AND subquery.event_timestamp >= entity_dataframe.entity_timestamp - {{ featureview.ttl }} * interval '1' second
+        {% endif %}
+
+        {% for entity in featureview.entities %}
+        AND subquery.{{ entity }} = entity_dataframe.{{ entity }}
+        {% endfor %}
+),
+
+/*
+ 2. If the `created_timestamp_column` has been set, we need to
+    deduplicate the data first. This is done by calculating the
+    `MAX(created_at_timestamp)` for each event_timestamp.
+    We then join the data on the next CTE
+*/
+{% if featureview.created_timestamp_column %}
+{{ featureview.name }}__dedup AS (
+    SELECT
+        {{featureview.name}}__entity_row_unique_id,
+        event_timestamp,
+        MAX(created_timestamp) as created_timestamp
+    FROM {{ featureview.name }}__base
+    GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp
+),
+{% endif %}
+
+/*
+ 3. The data has been filtered during the first CTE "*__base"
+    Thus we only need to compute the latest timestamp of each feature.
+*/ +{{ featureview.name }}__latest AS ( + SELECT + event_timestamp, + {% if featureview.created_timestamp_column %}created_timestamp,{% endif %} + {{featureview.name}}__entity_row_unique_id + FROM + ( + SELECT base.*, + ROW_NUMBER() OVER( + PARTITION BY base.{{featureview.name}}__entity_row_unique_id + ORDER BY base.event_timestamp DESC{% if featureview.created_timestamp_column %},base.created_timestamp DESC{% endif %} + ) AS row_number + FROM {{ featureview.name }}__base as base + {% if featureview.created_timestamp_column %} + INNER JOIN {{ featureview.name }}__dedup as dedup + ON TRUE + AND base.{{featureview.name}}__entity_row_unique_id = dedup.{{featureview.name}}__entity_row_unique_id + AND base.event_timestamp = dedup.event_timestamp + AND base.created_timestamp = dedup.created_timestamp + {% endif %} + ) + WHERE row_number = 1 +), + +/* + 4. Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +{{ featureview.name }}__cleaned AS ( + SELECT base.* + FROM {{ featureview.name }}__base as base + INNER JOIN {{ featureview.name }}__latest as latest + ON TRUE + AND base.{{featureview.name}}__entity_row_unique_id = latest.{{featureview.name}}__entity_row_unique_id + AND base.event_timestamp = latest.event_timestamp + {% if featureview.created_timestamp_column %} + AND base.created_timestamp = latest.created_timestamp + {% endif %} +){% if loop.last %}{% else %}, {% endif %} + + +{% endfor %} +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. 
+ */ + +SELECT {{ final_output_feature_names | join(', ')}} +FROM entity_dataframe as entity_df +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + {{featureview.name}}__entity_row_unique_id + {% for feature in featureview.features %} + ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %} + {% endfor %} + FROM {{ featureview.name }}__cleaned +) as cleaned +ON TRUE +AND entity_df.{{featureview.name}}__entity_row_unique_id = cleaned.{{featureview.name}}__entity_row_unique_id +{% endfor %} +""" diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena_source.py b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena_source.py new file mode 100644 index 0000000000..bac027ff3e --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/athena_source.py @@ -0,0 +1,343 @@ +from typing import Callable, Dict, Iterable, Optional, Tuple + +from feast import type_map +from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException, DataSourceNotFoundException +from feast.feature_logging import LoggingDestination +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.FeatureService_pb2 import ( + LoggingConfig as LoggingConfigProto, +) +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.repo_config import RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.value_type import ValueType + + +class AthenaSource(DataSource): + def __init__( + self, + *, + timestamp_field: Optional[str] = "", + table: Optional[str] = None, + database: Optional[str] = None, + data_source: Optional[str] = None, + created_timestamp_column: Optional[str] = None, + field_mapping: Optional[Dict[str, str]] = None, + 
date_partition_column: Optional[str] = None,
+        query: Optional[str] = None,
+        name: Optional[str] = None,
+        description: Optional[str] = "",
+        tags: Optional[Dict[str, str]] = None,
+        owner: Optional[str] = "",
+    ):
+        """
+        Creates an AthenaSource object.
+
+        Args:
+            timestamp_field: event timestamp column.
+            table (optional): Athena table where the features are stored. Exactly one of 'table'
+                and 'query' must be specified.
+            database: Athena Database Name
+            data_source (optional): Athena data source
+            created_timestamp_column (optional): Timestamp column indicating when the
+                row was created, used for deduplicating rows.
+            field_mapping (optional): A dictionary mapping of column names in this data
+                source to column names in a feature table or view.
+            date_partition_column: Timestamp column used for partitioning.
+            query (optional): The query to be executed to obtain the features. Exactly one of 'table'
+                and 'query' must be specified.
+            name (optional): Name for the source. Defaults to the table if not specified, in which
+                case the table must be specified.
+            description (optional): A human-readable description.
+            tags (optional): A dictionary of key-value pairs to store arbitrary metadata.
+            owner (optional): The owner of the athena source, typically the email of the primary
+                maintainer.
+        """
+        _database = "default" if table and not database else database
+        self.athena_options = AthenaOptions(
+            table=table, query=query, database=_database, data_source=data_source
+        )
+
+        if table is None and query is None:
+            raise ValueError('No "table" argument provided.')
+
+        # If no name, use the table as the default name.
+        if name is None and table is None:
+            raise DataSourceNoNameException()
+        _name = name or table
+        assert _name
+
+        super().__init__(
+            name=_name if _name else "",
+            timestamp_field=timestamp_field,
+            created_timestamp_column=created_timestamp_column,
+            field_mapping=field_mapping,
+            date_partition_column=date_partition_column,
+            description=description,
+            tags=tags,
+            owner=owner,
+        )
+
+    @staticmethod
+    def from_proto(data_source: DataSourceProto):
+        """
+        Creates an AthenaSource from a protobuf representation of an AthenaSource.
+
+        Args:
+            data_source: A protobuf representation of an AthenaSource
+
+        Returns:
+            An AthenaSource object based on the data_source protobuf.
+        """
+        return AthenaSource(
+            name=data_source.name,
+            timestamp_field=data_source.timestamp_field,
+            table=data_source.athena_options.table,
+            database=data_source.athena_options.database,
+            data_source=data_source.athena_options.data_source,
+            created_timestamp_column=data_source.created_timestamp_column,
+            field_mapping=dict(data_source.field_mapping),
+            date_partition_column=data_source.date_partition_column,
+            query=data_source.athena_options.query,
+            description=data_source.description,
+            tags=dict(data_source.tags),
+        )
+
+    # Note: Python requires redefining hash in child classes that override __eq__
+    def __hash__(self):
+        return super().__hash__()
+
+    def __eq__(self, other):
+        if not isinstance(other, AthenaSource):
+            raise TypeError(
+                "Comparisons should only involve AthenaSource class objects."
+ ) + + return ( + super().__eq__(other) + and self.athena_options.table == other.athena_options.table + and self.athena_options.query == other.athena_options.query + and self.athena_options.database == other.athena_options.database + and self.athena_options.data_source == other.athena_options.data_source + ) + + @property + def table(self): + """Returns the table of this Athena source.""" + return self.athena_options.table + + @property + def database(self): + """Returns the database of this Athena source.""" + return self.athena_options.database + + @property + def query(self): + """Returns the Athena query of this Athena source.""" + return self.athena_options.query + + @property + def data_source(self): + """Returns the Athena data_source of this Athena source.""" + return self.athena_options.data_source + + def to_proto(self) -> DataSourceProto: + """ + Converts an AthenaSource object to its protobuf representation. + + Returns: + A DataSourceProto object. + """ + data_source_proto = DataSourceProto( + type=DataSourceProto.BATCH_ATHENA, + name=self.name, + timestamp_field=self.timestamp_field, + created_timestamp_column=self.created_timestamp_column, + field_mapping=self.field_mapping, + date_partition_column=self.date_partition_column, + description=self.description, + tags=self.tags, + athena_options=self.athena_options.to_proto(), + ) + + return data_source_proto + + def validate(self, config: RepoConfig): + # As long as the query gets successfully executed, or the table exists, + # the data source is validated. We don't need the results though.
+ self.get_table_column_names_and_types(config) + + def get_table_query_string(self, config: Optional[RepoConfig] = None) -> str: + """Returns a string that can directly be used to reference this table in SQL.""" + if self.table: + data_source = self.data_source + database = self.database + if config: + data_source = config.offline_store.data_source + database = config.offline_store.database + return f'"{data_source}"."{database}"."{self.table}"' + else: + return f"({self.query})" + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return type_map.athena_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + """ + Returns a mapping of column names to types for this Athena source. + + Args: + config: A RepoConfig describing the feature repo + """ + from botocore.exceptions import ClientError + + from feast.infra.offline_stores.contrib.athena_offline_store.athena import ( + AthenaOfflineStoreConfig, + ) + from feast.infra.utils import aws_utils + + assert isinstance(config.offline_store, AthenaOfflineStoreConfig) + + client = aws_utils.get_athena_data_client(config.offline_store.region) + if self.table: + try: + table = client.get_table_metadata( + CatalogName=self.data_source, + DatabaseName=self.database, + TableName=self.table, + ) + except ClientError as e: + raise aws_utils.AthenaError(e) + + # The API returns valid JSON with empty column list when the table doesn't exist + if len(table["TableMetadata"]["Columns"]) == 0: + raise DataSourceNotFoundException(self.table) + + columns = table["TableMetadata"]["Columns"] + else: + statement_id = aws_utils.execute_athena_query( + client, + config.offline_store.data_source, + config.offline_store.database, + f"SELECT * FROM ({self.query}) LIMIT 1", + ) + columns = aws_utils.get_athena_query_result(client, statement_id)[ + "ResultSetMetadata" + ]["ColumnInfo"] + + return [(column["Name"], 
column["Type"].upper()) for column in columns] + + +class AthenaOptions: + """ + Configuration options for a Athena data source. + """ + + def __init__( + self, + table: Optional[str], + query: Optional[str], + database: Optional[str], + data_source: Optional[str], + ): + self.table = table or "" + self.query = query or "" + self.database = database or "" + self.data_source = data_source or "" + + @classmethod + def from_proto(cls, athena_options_proto: DataSourceProto.AthenaOptions): + """ + Creates a AthenaOptions from a protobuf representation of a Athena option. + + Args: + athena_options_proto: A protobuf representation of a DataSource + + Returns: + A AthenaOptions object based on the athena_options protobuf. + """ + athena_options = cls( + table=athena_options_proto.table, + query=athena_options_proto.query, + database=athena_options_proto.database, + data_source=athena_options_proto.data_source, + ) + + return athena_options + + def to_proto(self) -> DataSourceProto.AthenaOptions: + """ + Converts an AthenaOptionsProto object to its protobuf representation. + + Returns: + A AthenaOptionsProto protobuf. 
+ """ + athena_options_proto = DataSourceProto.AthenaOptions( + table=self.table, + query=self.query, + database=self.database, + data_source=self.data_source, + ) + + return athena_options_proto + + +class SavedDatasetAthenaStorage(SavedDatasetStorage): + _proto_attr_name = "athena_storage" + + athena_options: AthenaOptions + + def __init__( + self, + table_ref: str, + query: str = None, + database: str = None, + data_source: str = None, + ): + self.athena_options = AthenaOptions( + table=table_ref, query=query, database=database, data_source=data_source + ) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + + return SavedDatasetAthenaStorage( + table_ref=AthenaOptions.from_proto(storage_proto.athena_storage).table + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto(athena_storage=self.athena_options.to_proto()) + + def to_data_source(self) -> DataSource: + return AthenaSource(table=self.athena_options.table) + + +class AthenaLoggingDestination(LoggingDestination): + _proto_kind = "athena_destination" + + table_name: str + + def __init__(self, *, table_name: str): + self.table_name = table_name + + @classmethod + def from_proto(cls, config_proto: LoggingConfigProto) -> "LoggingDestination": + return AthenaLoggingDestination( + table_name=config_proto.athena_destination.table_name, + ) + + def to_proto(self) -> LoggingConfigProto: + return LoggingConfigProto( + athena_destination=LoggingConfigProto.AthenaDestination( + table_name=self.table_name + ) + ) + + def to_data_source(self) -> DataSource: + return AthenaSource(table=self.table_name) diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/tests/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/tests/data_source.py 
b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/tests/data_source.py new file mode 100644 index 0000000000..92e0d6e5f6 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/athena_offline_store/tests/data_source.py @@ -0,0 +1,130 @@ +import os +import uuid +from typing import Dict, List, Optional + +import pandas as pd + +from feast import AthenaSource +from feast.data_source import DataSource +from feast.feature_logging import LoggingDestination +from feast.infra.offline_stores.contrib.athena_offline_store.athena import ( + AthenaOfflineStoreConfig, +) +from feast.infra.offline_stores.contrib.athena_offline_store.athena_source import ( + AthenaLoggingDestination, + SavedDatasetAthenaStorage, +) +from feast.infra.utils import aws_utils +from feast.repo_config import FeastConfigBaseModel +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + + +class AthenaDataSourceCreator(DataSourceCreator): + + tables: List[str] = [] + + def __init__(self, project_name: str, *args, **kwargs): + super().__init__(project_name) + self.client = aws_utils.get_athena_data_client("ap-northeast-2") + self.s3 = aws_utils.get_s3_resource("ap-northeast-2") + data_source = ( + os.environ.get("ATHENA_DATA_SOURCE") + if os.environ.get("ATHENA_DATA_SOURCE") + else "AwsDataCatalog" + ) + database = ( + os.environ.get("ATHENA_DATABASE") + if os.environ.get("ATHENA_DATABASE") + else "default" + ) + bucket_name = ( + os.environ.get("ATHENA_S3_BUCKET_NAME") + if os.environ.get("ATHENA_S3_BUCKET_NAME") + else "feast-integration-tests" + ) + self.offline_store_config = AthenaOfflineStoreConfig( + data_source=f"{data_source}", + region="ap-northeast-2", + database=f"{database}", + s3_staging_location=f"s3://{bucket_name}/test_dir", + ) + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + suffix: Optional[str] = None, + timestamp_field="ts", + created_timestamp_column="created_ts", + 
field_mapping: Dict[str, str] = None, + ) -> DataSource: + + table_name = destination_name + s3_target = ( + self.offline_store_config.s3_staging_location + + "/" + + self.project_name + + "/" + + table_name + + "/" + + table_name + + ".parquet" + ) + + aws_utils.upload_df_to_athena( + self.client, + self.offline_store_config.data_source, + self.offline_store_config.database, + self.s3, + s3_target, + table_name, + df, + ) + + self.tables.append(table_name) + + return AthenaSource( + table=table_name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping or {"ts_1": "ts"}, + database=self.offline_store_config.database, + data_source=self.offline_store_config.data_source, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetAthenaStorage: + table = self.get_prefixed_table_name( + f"persisted_ds_{str(uuid.uuid4()).replace('-', '_')}" + ) + self.tables.append(table) + + return SavedDatasetAthenaStorage( + table_ref=table, + database=self.offline_store_config.database, + data_source=self.offline_store_config.data_source, + ) + + def create_logged_features_destination(self) -> LoggingDestination: + table = self.get_prefixed_table_name( + f"persisted_ds_{str(uuid.uuid4()).replace('-', '_')}" + ) + self.tables.append(table) + + return AthenaLoggingDestination(table_name=table) + + def create_offline_store_config(self) -> FeastConfigBaseModel: + return self.offline_store_config + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" + + def teardown(self): + for table in self.tables: + aws_utils.execute_athena_query( + self.client, + self.offline_store_config.data_source, + self.offline_store_config.database, + f"DROP TABLE IF EXISTS {table}", + ) diff --git a/sdk/python/feast/infra/offline_stores/contrib/athena_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/athena_repo_configuration.py new file mode 100644 index 
0000000000..32376eb652 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/athena_repo_configuration.py @@ -0,0 +1,15 @@ +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, +) +from tests.integration.feature_repos.universal.data_sources.athena import ( + AthenaDataSourceCreator, +) + +FULL_REPO_CONFIGS = [ + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=AthenaDataSourceCreator, + ), +] + +AVAILABLE_OFFLINE_STORES = [("aws", AthenaDataSourceCreator)] diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py new file mode 100644 index 0000000000..8dc5f6c654 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssql.py @@ -0,0 +1,650 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+import warnings +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +import numpy as np +import pandas +import pyarrow +import pyarrow as pa +import sqlalchemy +from pydantic.types import StrictStr +from pydantic.typing import Literal +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine +from sqlalchemy.orm import sessionmaker + +from feast import FileSource, errors +from feast.data_source import DataSource +from feast.errors import InvalidEntityType +from feast.feature_logging import LoggingConfig, LoggingSource +from feast.feature_view import FeatureView +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.file_source import SavedDatasetFileStorage +from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalMetadata +from feast.infra.offline_stores.offline_utils import ( + DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, + build_point_in_time_query, + get_feature_view_query_context, +) +from feast.infra.provider import RetrievalJob +from feast.infra.registry.base_registry import BaseRegistry +from feast.on_demand_feature_view import OnDemandFeatureView +from feast.repo_config import FeastBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import pa_to_mssql_type +from feast.usage import log_exceptions_and_usage + +# Make sure warning doesn't raise more than once. 
+warnings.simplefilter("once", RuntimeWarning) + +EntitySchema = Dict[str, np.dtype] + + +class MsSqlServerOfflineStoreConfig(FeastBaseModel): + """Offline store config for SQL Server""" + + type: Literal["mssql"] = "mssql" + """ Offline store type selector""" + + connection_string: StrictStr = "mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server" + """Connection string containing the host, port, and configuration parameters for SQL Server + format: SQLAlchemy connection string, e.g. mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server""" + + +def make_engine(config: MsSqlServerOfflineStoreConfig) -> Engine: + return create_engine(config.connection_string) + + +class MsSqlServerOfflineStore(OfflineStore): + """ + Microsoft SQL Server based offline store, supporting Azure Synapse or Azure SQL. + + Note: to use this, you'll need to have Microsoft ODBC 17 installed. + See https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15#17 + """ + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + assert type(data_source).__name__ == "MsSqlServerSource" + from_expression = data_source.get_table_query_string().replace("`", "") + + partition_by_join_key_string = ", ".join(join_key_columns) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamps = [timestamp_field] + if created_timestamp_column: + timestamps.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) + + query = f""" + SELECT {field_string} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row + FROM {from_expression} inner_t + WHERE {timestamp_field} BETWEEN CONVERT(DATETIMEOFFSET, '{start_date}', 120) AND CONVERT(DATETIMEOFFSET, '{end_date}', 120) + ) outer_t + WHERE outer_t._feast_row = 1 + """ + engine = make_engine(config.offline_store) + + return MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert type(data_source).__name__ == "MsSqlServerSource" + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + from_expression = data_source.get_table_query_string().replace("`", "") + timestamps = [timestamp_field] + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) + + query = f""" + SELECT {field_string} + FROM ( + SELECT {field_string} + FROM {from_expression} + WHERE {timestamp_field} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' + ) + """ + engine = make_engine(config.offline_store) + + return MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="mssql") + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + warnings.warn( + "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + + expected_join_keys = _get_join_keys(project, feature_views, registry) + assert isinstance(config.offline_store, MsSqlServerOfflineStoreConfig) + engine = make_engine(config.offline_store) + if isinstance(entity_df, pandas.DataFrame): + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df( + dict(zip(list(entity_df.columns), list(entity_df.dtypes))) + ) + ) + entity_df[entity_df_event_timestamp_col] = pandas.to_datetime( + entity_df[entity_df_event_timestamp_col], utc=True + ).fillna(pandas.Timestamp.now()) + + elif isinstance(entity_df, str): + raise ValueError( + "string entities are currently not supported in the MsSQL offline store." 
+ ) + ( + table_schema, + table_name, + ) = _upload_entity_df_into_sqlserver_and_get_entity_schema( + engine, config, entity_df, full_feature_names=full_feature_names + ) + + _assert_expected_columns_in_sqlserver( + expected_join_keys, + entity_df_event_timestamp_col, + table_schema, + ) + + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, + entity_df_event_timestamp_col, + engine, + ) + + # Build a query context containing all information required to template the SQL query + query_context = get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_timestamp_range=entity_df_event_timestamp_range, + ) + + # Generate the SQL query from the query context + query = build_point_in_time_query( + query_context, + left_table_query_string=table_name, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + entity_df_columns=table_schema.keys(), + full_feature_names=full_feature_names, + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + ) + query = query.replace("`", "") + + job = MsSqlServerRetrievalJob( + query=query, + engine=engine, + config=config.offline_store, + full_feature_names=full_feature_names, + on_demand_feature_views=registry.list_on_demand_feature_views(project), + ) + return job + + @staticmethod + def write_logged_features( + config: RepoConfig, + data: Union[pyarrow.Table, Path], + source: LoggingSource, + logging_config: LoggingConfig, + registry: BaseRegistry, + ): + raise NotImplementedError() + + @staticmethod + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + table: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): + raise NotImplementedError() + + +def _assert_expected_columns_in_dataframe( + join_keys: Set[str], entity_df_event_timestamp_col: str, entity_df: pandas.DataFrame +): + entity_df_columns = set(entity_df.columns.values) + expected_columns = join_keys.copy() + 
expected_columns.add(entity_df_event_timestamp_col) + + missing_keys = expected_columns - entity_df_columns + + if len(missing_keys) != 0: + raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys) + + +def _assert_expected_columns_in_sqlserver( + join_keys: Set[str], entity_df_event_timestamp_col: str, table_schema: EntitySchema +): + entity_columns = set(table_schema.keys()) + expected_columns = join_keys.copy() + expected_columns.add(entity_df_event_timestamp_col) + + missing_keys = expected_columns - entity_columns + + if len(missing_keys) != 0: + raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys) + + +def _get_join_keys( + project: str, feature_views: List[FeatureView], registry: BaseRegistry +) -> Set[str]: + join_keys = set() + for feature_view in feature_views: + entities = feature_view.entities + for entity_name in entities: + entity = registry.get_entity(entity_name, project) + join_keys.add(entity.join_key) + return join_keys + + +def _infer_event_timestamp_from_sqlserver_schema(table_schema) -> str: + if any( + schema_field["COLUMN_NAME"] == DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL + for schema_field in table_schema + ): + return DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL + else: + datetime_columns = list( + filter( + lambda schema_field: schema_field["DATA_TYPE"] == "DATETIMEOFFSET", + table_schema, + ) + ) + if len(datetime_columns) == 1: + print( + f"Using {datetime_columns[0]['COLUMN_NAME']} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}." + ) + return datetime_columns[0]["COLUMN_NAME"] + else: + raise ValueError( + f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
+ ) + + +class MsSqlServerRetrievalJob(RetrievalJob): + def __init__( + self, + query: str, + engine: Engine, + config: MsSqlServerOfflineStoreConfig, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]], + metadata: Optional[RetrievalMetadata] = None, + drop_columns: Optional[List[str]] = None, + ): + self.query = query + self.engine = engine + self._config = config + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views or [] + self._drop_columns = drop_columns + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: + return self._on_demand_feature_views + + def _to_df_internal(self) -> pandas.DataFrame: + return pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan) + + def _to_arrow_internal(self) -> pyarrow.Table: + result = pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan) + return pyarrow.Table.from_pandas(result) + + ## Implements persist in Feast 0.18 - This persists to filestorage + ## ToDo: Persist to Azure Storage + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): + assert isinstance(storage, SavedDatasetFileStorage) + + filesystem, path = FileSource.create_filesystem_and_path( + storage.file_options.uri, + storage.file_options.s3_endpoint_override, + ) + + if path.endswith(".parquet"): + pyarrow.parquet.write_table( + self.to_arrow(), where=path, filesystem=filesystem + ) + else: + # otherwise assume destination is directory + pyarrow.parquet.write_to_dataset( + self.to_arrow(), root_path=path, filesystem=filesystem + ) + + def supports_remote_storage_export(self) -> bool: + return False + + def to_remote_storage(self) -> List[str]: + raise NotImplementedError() + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + return self._metadata + + +def 
_upload_entity_df_into_sqlserver_and_get_entity_schema( + engine: sqlalchemy.engine.Engine, + config: RepoConfig, + entity_df: Union[pandas.DataFrame, str], + full_feature_names: bool, +) -> Tuple[Dict[Any, Any], str]: + """ + Uploads a Pandas entity dataframe into a SQL Server table and constructs the + schema from the original entity_df dataframe. + """ + table_id = offline_utils.get_temp_entity_table_name() + session = sessionmaker(bind=engine)() + + if type(entity_df) is str: + # TODO: This should be a temporary table, right? + session.execute(f"SELECT * INTO {table_id} FROM ({entity_df}) t") # type: ignore + + session.commit() + + limited_entity_df = MsSqlServerRetrievalJob( + f"SELECT TOP 1 * FROM {table_id}", + engine, + config.offline_store, + full_feature_names=full_feature_names, + on_demand_feature_views=None, + ).to_df() + + entity_schema = ( + dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)), + table_id, + ) + + elif isinstance(entity_df, pandas.DataFrame): + # Drop the index so that we don't have unnecessary columns + engine.execute(_df_to_create_table_sql(entity_df, table_id)) + entity_df.to_sql(name=table_id, con=engine, index=False, if_exists="append") + entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)), table_id + + else: + raise ValueError( + f"The entity dataframe you have provided must be a SQL Server SQL query," + f" or a Pandas dataframe. 
But we found: {type(entity_df)} " + ) + + return entity_schema + + +def _df_to_create_table_sql(df: pandas.DataFrame, table_name: str) -> str: + pa_table = pa.Table.from_pandas(df) + + columns = [f""""{f.name}" {pa_to_mssql_type(f.type)}""" for f in pa_table.schema] + + return f""" + CREATE TABLE "{table_name}" ( + {", ".join(columns)} + ); + """ + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pandas.DataFrame, str], + entity_df_event_timestamp_col: str, + engine: Engine, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pandas.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pandas.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pandas.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + df = pandas.read_sql(entity_df, con=engine).fillna(value=np.nan) + entity_df_event_timestamp = df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pandas.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pandas.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +# TODO: Optimizations +# * Use NEWID() instead of ROW_NUMBER(), or join on entity columns directly +# * Precompute ROW_NUMBER() so that it doesn't have to be recomputed for every query on entity_dataframe +# * Create temporary tables instead of keeping all tables in memory + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """ +/* + Compute a 
deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +WITH entity_dataframe AS ( + SELECT *, + {{entity_df_event_timestamp_col}} AS entity_timestamp + {% for featureview in featureviews %} + ,CONCAT( + {% for entity_key in unique_entity_keys %} + {{entity_key}}, + {% endfor %} + {{entity_df_event_timestamp_col}} + ) AS {{featureview.name}}__entity_row_unique_id + {% endfor %} + FROM {{ left_table_query_string }} +), + +{% for featureview in featureviews %} + +{{ featureview.name }}__entity_dataframe AS ( + SELECT + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id + FROM entity_dataframe + GROUP BY + {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + entity_timestamp, + {{featureview.name}}__entity_row_unique_id +), + +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + + 1. We first join the current feature_view to the entity dataframe that has been passed. + This JOIN has the following logic: + - For each row of the entity dataframe, only keep the rows where the timestamp_field` + is less than the one provided in the entity dataframe + - If there a TTL for the current feature_view, also keep the rows where the `timestamp_field` + is higher the the one provided minus the TTL + - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been + computed previously + + The output of this CTE will contain all the necessary information and already filtered out most + of the data that is not relevant. 
+*/ + +{{ featureview.name }}__subquery AS ( + SELECT + {{ featureview.timestamp_field }} as event_timestamp, + {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} + {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %} + {% for feature in featureview.features %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {% endfor %} + FROM {{ featureview.table_subquery }} + WHERE {{ featureview.timestamp_field }} <= '{{ featureview.max_event_timestamp }}' + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.timestamp_field }} >= '{{ featureview.min_event_timestamp }}' + {% endif %} +), + +{{ featureview.name }}__base AS ( + SELECT + subquery.*, + entity_dataframe.{{entity_df_event_timestamp_col}} AS entity_timestamp, + entity_dataframe.{{featureview.name}}__entity_row_unique_id + FROM {{ featureview.name }}__subquery AS subquery + INNER JOIN entity_dataframe + ON 1=1 + AND subquery.event_timestamp <= entity_dataframe.{{entity_df_event_timestamp_col}} + + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.ttl }} > = DATEDIFF(SECOND, subquery.event_timestamp, entity_dataframe.{{entity_df_event_timestamp_col}}) + {% endif %} + + {% for entity in featureview.entities %} + AND subquery.{{ entity }} = entity_dataframe.{{ entity }} + {% endfor %} +), + +/* + 2. If the `created_timestamp_column` has been set, we need to + deduplicate the data first. This is done by calculating the + `MAX(created_at_timestamp)` for each event_timestamp. 
+ We then join the data on the next CTE +*/ +{% if featureview.created_timestamp_column %} +{{ featureview.name }}__dedup AS ( + SELECT + {{featureview.name}}__entity_row_unique_id, + event_timestamp, + MAX(created_timestamp) as created_timestamp + FROM {{ featureview.name }}__base + GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp +), +{% endif %} + +/* + 3. The data has been filtered during the first CTE "*__base" + Thus we only need to compute the latest timestamp of each feature. +*/ +{{ featureview.name }}__latest AS ( + SELECT + {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id, + MAX({{ featureview.name }}__base.event_timestamp) AS event_timestamp + {% if featureview.created_timestamp_column %} + ,MAX({{ featureview.name }}__base.created_timestamp) AS created_timestamp + {% endif %} + + FROM {{ featureview.name }}__base + {% if featureview.created_timestamp_column %} + INNER JOIN {{ featureview.name }}__dedup + ON {{ featureview.name }}__dedup.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id + AND {{ featureview.name }}__dedup.event_timestamp = {{ featureview.name }}__base.event_timestamp + AND {{ featureview.name }}__dedup.created_timestamp = {{ featureview.name }}__base.created_timestamp + {% endif %} + + GROUP BY {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id +), + +/* + 4. 
Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +{{ featureview.name }}__cleaned AS ( + SELECT base.* + FROM {{ featureview.name }}__base as base + INNER JOIN {{ featureview.name }}__latest + ON base.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__latest.{{ featureview.name }}__entity_row_unique_id + AND base.event_timestamp = {{ featureview.name }}__latest.event_timestamp + {% if featureview.created_timestamp_column %} + AND base.created_timestamp = {{ featureview.name }}__latest.created_timestamp + {% endif %} +){% if loop.last %}{% else %}, {% endif %} + +{% endfor %} + +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. + */ + +SELECT {{ final_output_feature_names | join(', ')}} +FROM entity_dataframe +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + {{featureview.name}}__entity_row_unique_id + {% for feature in featureview.features %} + ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %} + {% endfor %} + FROM "{{ featureview.name }}__cleaned" +) {{ featureview.name }}__cleaned +ON +{{ featureview.name }}__cleaned.{{ featureview.name }}__entity_row_unique_id = entity_dataframe.{{ featureview.name }}__entity_row_unique_id +{% endfor %} +""" diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py new file mode 100644 index 0000000000..6b126fa40c --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/mssqlserver_source.py @@ -0,0 +1,252 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+import json +import warnings +from typing import Callable, Dict, Iterable, Optional, Tuple + +import pandas +from sqlalchemy import create_engine + +from feast import type_map +from feast.data_source import DataSource +from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import ( + MsSqlServerOfflineStoreConfig, +) +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.repo_config import RepoConfig +from feast.value_type import ValueType + +# Make sure azure warning doesn't raise more than once. +warnings.simplefilter("once", RuntimeWarning) + + +class MsSqlServerOptions: + """ + DataSource MsSQLServer options used to source features from MsSQLServer query + """ + + def __init__( + self, + connection_str: Optional[str], + table_ref: Optional[str], + ): + self._connection_str = connection_str + self._table_ref = table_ref + + @property + def table_ref(self): + """ + Returns the table ref of this SQL Server source + """ + return self._table_ref + + @table_ref.setter + def table_ref(self, table_ref): + """ + Sets the table ref of this SQL Server source + """ + self._table_ref = table_ref + + @property + def connection_str(self): + """ + Returns the SqlServer SQL connection string referenced by this source + """ + return self._connection_str + + @connection_str.setter + def connection_str(self, connection_str): + """ + Sets the SqlServer SQL connection string referenced by this source + """ + self._connection_str = connection_str + + @classmethod + def from_proto( + cls, sqlserver_options_proto: DataSourceProto.CustomSourceOptions + ) -> "MsSqlServerOptions": + """ + Creates an MsSQLServerOptions from a protobuf representation of a SqlServer option + Args: + sqlserver_options_proto: A protobuf representation of a DataSource + Returns: + Returns a SQLServerOptions object based on the sqlserver_options protobuf + """ + options = json.loads(sqlserver_options_proto.configuration) + + sqlserver_options = cls( + 
table_ref=options["table_ref"], + connection_str=options["connection_string"], + ) + + return sqlserver_options + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + """ + Converts a MsSQLServerOptions object to a protobuf representation. + Returns: + CustomSourceOptions protobuf + """ + + sqlserver_options_proto = DataSourceProto.CustomSourceOptions( + configuration=json.dumps( + { + "table_ref": self._table_ref, + "connection_string": self._connection_str, + } + ).encode("utf-8") + ) + + return sqlserver_options_proto + + +class MsSqlServerSource(DataSource): + def __init__( + self, + name: str, + table_ref: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + created_timestamp_column: Optional[str] = "", + field_mapping: Optional[Dict[str, str]] = None, + date_partition_column: Optional[str] = "", + connection_str: Optional[str] = "", + description: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, + owner: Optional[str] = None, + ): + warnings.warn( + "The Azure Synapse + Azure SQL data source is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + self._mssqlserver_options = MsSqlServerOptions( + connection_str=connection_str, table_ref=table_ref + ) + self._connection_str = connection_str + + super().__init__( + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + date_partition_column=date_partition_column, + description=description, + tags=tags, + owner=owner, + name=name, + timestamp_field=event_timestamp_column, + ) + + def __eq__(self, other): + if not isinstance(other, MsSqlServerSource): + raise TypeError( + "Comparisons should only involve SqlServerSource class objects." 
+ ) + + return ( + self.name == other.name + and self.mssqlserver_options.connection_str + == other.mssqlserver_options.connection_str + and self.timestamp_field == other.timestamp_field + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + def __hash__(self): + return hash( + ( + self.name, + self.mssqlserver_options.connection_str, + self.timestamp_field, + self.created_timestamp_column, + ) + ) + + @property + def table_ref(self): + return self._mssqlserver_options.table_ref + + @property + def mssqlserver_options(self): + """ + Returns the SQL Server options of this data source + """ + return self._mssqlserver_options + + @mssqlserver_options.setter + def mssqlserver_options(self, sqlserver_options): + """ + Sets the SQL Server options of this data source + """ + self._mssqlserver_options = sqlserver_options + + @staticmethod + def from_proto(data_source: DataSourceProto): + options = json.loads(data_source.custom_options.configuration) + return MsSqlServerSource( + name=data_source.name, + field_mapping=dict(data_source.field_mapping), + table_ref=options["table_ref"], + connection_str=options["connection_string"], + event_timestamp_column=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source.MsSqlServerSource", + field_mapping=self.field_mapping, + custom_options=self.mssqlserver_options.to_proto(), + ) + + data_source_proto.timestamp_field = self.timestamp_field + data_source_proto.created_timestamp_column = self.created_timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + data_source_proto.name = self.name + return data_source_proto 
+ + def get_table_query_string(self) -> str: + """Returns a string that can directly be used to reference this table in SQL""" + return f"`{self.table_ref}`" + + def validate(self, config: RepoConfig): + # As long as the query gets successfully executed, or the table exists, + # the data source is validated. We don't need the results though. + self.get_table_column_names_and_types(config) + return None + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return type_map.mssql_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + assert isinstance(config.offline_store, MsSqlServerOfflineStoreConfig) + conn = create_engine(config.offline_store.connection_string) + self._mssqlserver_options.connection_str = ( + config.offline_store.connection_string + ) + name_type_pairs = [] + if len(self.table_ref.split(".")) == 2: + database, table_name = self.table_ref.split(".") + columns_query = f""" + SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_NAME = '{table_name}' and table_schema = '{database}' + """ + else: + columns_query = f""" + SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_NAME = '{self.table_ref}' + """ + + table_schema = pandas.read_sql(columns_query, conn) + name_type_pairs.extend( + list( + zip( + table_schema["COLUMN_NAME"].to_list(), + table_schema["DATA_TYPE"].to_list(), + ) + ) + ) + return name_type_pairs diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py new file mode 100644 index 0000000000..ae7affc838 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/__init__.py @@ -0,0 +1 @@ +from .data_source import mssql_container # noqa diff --git 
a/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/data_source.py new file mode 100644 index 0000000000..9b751d98ef --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mssql_offline_store/tests/data_source.py @@ -0,0 +1,108 @@ +from typing import Dict, List + +import pandas as pd +import pytest +from sqlalchemy import create_engine +from testcontainers.core.waiting_utils import wait_for_logs +from testcontainers.mssql import SqlServerContainer + +from feast.data_source import DataSource +from feast.infra.offline_stores.contrib.mssql_offline_store.mssql import ( + MsSqlServerOfflineStoreConfig, + _df_to_create_table_sql, +) +from feast.infra.offline_stores.contrib.mssql_offline_store.mssqlserver_source import ( + MsSqlServerSource, +) +from feast.saved_dataset import SavedDatasetStorage +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + +MSSQL_USER = "SA" +MSSQL_PASSWORD = "yourStrong(!)Password" + + +@pytest.fixture(scope="session") +def mssql_container(): + container = SqlServerContainer( + user=MSSQL_USER, + password=MSSQL_PASSWORD, + image="mcr.microsoft.com/azure-sql-edge:1.0.6", + ) + container.start() + log_string_to_wait_for = "Service Broker manager has started" + wait_for_logs(container=container, predicate=log_string_to_wait_for, timeout=30) + + yield container + container.stop() + + +class MsSqlDataSourceCreator(DataSourceCreator): + tables: List[str] = [] + + def __init__( + self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs + ): + super().__init__(project_name) + self.tables_created: List[str] = [] + self.container = fixture_request.getfixturevalue("mssql_container") + + if not self.container: + raise RuntimeError( + "In order to use this data source " + "'feast.infra.offline_stores.contrib.mssql_offline_store.tests' " + "must be include into pytest 
plugins" + ) + + def create_offline_store_config(self) -> MsSqlServerOfflineStoreConfig: + return MsSqlServerOfflineStoreConfig( + connection_string=self.container.get_connection_url(), + ) + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + timestamp_field="ts", + created_timestamp_column="created_ts", + field_mapping: Dict[str, str] = None, + **kwargs, + ) -> DataSource: + # Make sure the field mapping is correct and convert the datetime datasources. + if timestamp_field in df: + df[timestamp_field] = pd.to_datetime(df[timestamp_field], utc=True).fillna( + pd.Timestamp.now() + ) + if created_timestamp_column in df: + df[created_timestamp_column] = pd.to_datetime( + df[created_timestamp_column], utc=True + ).fillna(pd.Timestamp.now()) + + connection_string = self.create_offline_store_config().connection_string + engine = create_engine(connection_string) + destination_name = self.get_prefixed_table_name(destination_name) + # Create table + engine.execute(_df_to_create_table_sql(df, destination_name)) + + # Upload dataframe to azure table + df.to_sql(destination_name, engine, index=False, if_exists="append") + + self.tables.append(destination_name) + return MsSqlServerSource( + name="ci_mssql_source", + connection_str=connection_string, + table_ref=destination_name, + event_timestamp_column=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping or {"ts_1": "ts"}, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetStorage: + pass + + def get_prefixed_table_name(self, destination_name: str) -> str: + return f"{self.project_name}_{destination_name}" + + def teardown(self): + pass diff --git a/sdk/python/feast/infra/offline_stores/contrib/mssql_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/mssql_repo_configuration.py new file mode 100644 index 0000000000..50d636ba90 --- /dev/null +++ 
b/sdk/python/feast/infra/offline_stores/contrib/mssql_repo_configuration.py @@ -0,0 +1,13 @@ +from feast.infra.offline_stores.contrib.mssql_offline_store.tests.data_source import ( + MsSqlDataSourceCreator, +) +from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG +from tests.integration.feature_repos.universal.online_store.redis import ( + RedisOnlineStoreCreator, +) + +AVAILABLE_OFFLINE_STORES = [ + ("local", MsSqlDataSourceCreator), +] + +AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py index 28944df72e..ada41c023b 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py @@ -5,6 +5,7 @@ Any, Callable, ContextManager, + Dict, Iterator, KeysView, List, @@ -13,6 +14,7 @@ Union, ) +import numpy as np import pandas as pd import pyarrow as pa from jinja2 import BaseLoader, Environment @@ -24,11 +26,15 @@ from feast.errors import InvalidEntityType from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source import ( + SavedDatasetPostgreSQLStorage, +) from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, RetrievalMetadata, ) +from feast.infra.registry.registry import Registry from feast.infra.utils.postgres.connection_utils import ( _get_conn, df_to_postgres_table, @@ -36,7 +42,6 @@ ) from feast.infra.utils.postgres.postgres_config import PostgreSQLConfig from feast.on_demand_feature_view import OnDemandFeatureView -from feast.registry import Registry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage from 
feast.type_map import pg_type_code_to_arrow @@ -62,6 +67,7 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, PostgreSQLOfflineStoreConfig) assert isinstance(data_source, PostgreSQLSource) from_expression = data_source.get_table_query_string() @@ -112,24 +118,27 @@ def get_historical_features( project: str, full_feature_names: bool = False, ) -> RetrievalJob: + assert isinstance(config.offline_store, PostgreSQLOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, PostgreSQLSource) + + entity_schema = _get_entity_schema(entity_df, config) + + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) + ) + + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, + entity_df_event_timestamp_col, + config, + ) + @contextlib.contextmanager def query_generator() -> Iterator[str]: - table_name = None - if isinstance(entity_df, pd.DataFrame): - table_name = offline_utils.get_temp_entity_table_name() - entity_schema = df_to_postgres_table( - config.offline_store, entity_df, table_name - ) - df_query = table_name - elif isinstance(entity_df, str): - df_query = f"({entity_df}) AS sub" - entity_schema = get_query_schema(config.offline_store, df_query) - else: - raise TypeError(entity_df) - - entity_df_event_timestamp_col = ( - offline_utils.infer_event_timestamp_from_entity_df(entity_schema) - ) + table_name = offline_utils.get_temp_entity_table_name() + + _upload_entity_df(config, entity_df, table_name) expected_join_keys = offline_utils.get_expected_join_keys( project, feature_views, registry @@ -139,13 +148,6 @@ def query_generator() -> Iterator[str]: entity_schema, expected_join_keys, entity_df_event_timestamp_col ) - entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, - entity_df_event_timestamp_col, - config, - df_query, - ) - query_context = 
offline_utils.get_feature_view_query_context( feature_refs, feature_views, @@ -165,7 +167,7 @@ def query_generator() -> Iterator[str]: try: yield build_point_in_time_query( query_context_dict, - left_table_query_string=df_query, + left_table_query_string=table_name, entity_df_event_timestamp_col=entity_df_event_timestamp_col, entity_df_columns=entity_schema.keys(), query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, @@ -189,6 +191,12 @@ def query_generator() -> Iterator[str]: on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( feature_refs, project, registry ), + metadata=RetrievalMetadata( + features=feature_refs, + keys=list(entity_schema.keys() - {entity_df_event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + max_event_timestamp=entity_df_event_timestamp_range[1], + ), ) @staticmethod @@ -202,6 +210,7 @@ def pull_all_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, PostgreSQLOfflineStoreConfig) assert isinstance(data_source, PostgreSQLSource) from_expression = data_source.get_table_query_string() @@ -247,7 +256,7 @@ def query_generator() -> Iterator[str]: self._query_generator = query_generator self.config = config self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata @property @@ -255,7 +264,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views def _to_df_internal(self) -> pd.DataFrame: @@ -293,15 +302,20 @@ def _to_arrow_internal(self) -> pa.Table: def metadata(self) -> Optional[RetrievalMetadata]: return self._metadata - def persist(self, storage: SavedDatasetStorage): - pass + def persist(self, 
storage: SavedDatasetStorage, allow_overwrite: bool = False): + assert isinstance(storage, SavedDatasetPostgreSQLStorage) + + df_to_postgres_table( + config=self.config.offline_store, + df=self.to_df(), + table_name=storage.postgres_options._table, + ) def _get_entity_df_event_timestamp_range( entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, config: RepoConfig, - table_name: str, ) -> Tuple[datetime, datetime]: if isinstance(entity_df, pd.DataFrame): entity_df_event_timestamp = entity_df.loc[ @@ -312,15 +326,15 @@ def _get_entity_df_event_timestamp_range( entity_df_event_timestamp, utc=True ) entity_df_event_timestamp_range = ( - entity_df_event_timestamp.min(), - entity_df_event_timestamp.max(), + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), ) elif isinstance(entity_df, str): # If the entity_df is a string (SQL query), determine range # from table with _get_conn(config.offline_store) as conn, conn.cursor() as cur: cur.execute( - f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}" + f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM ({entity_df}) as tmp_alias" ), res = cur.fetchone() entity_df_event_timestamp_range = (res[0], res[1]) @@ -374,6 +388,34 @@ def build_point_in_time_query( return query +def _upload_entity_df( + config: RepoConfig, entity_df: Union[pd.DataFrame, str], table_name: str +): + if isinstance(entity_df, pd.DataFrame): + # If the entity_df is a pandas dataframe, upload it to Postgres + df_to_postgres_table(config.offline_store, entity_df, table_name) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), create a Postgres table out of it + with _get_conn(config.offline_store) as conn, conn.cursor() as cur: + cur.execute(f"CREATE TABLE {table_name} AS ({entity_df})") + else: + raise InvalidEntityType(type(entity_df)) + + +def 
_get_entity_schema( + entity_df: Union[pd.DataFrame, str], + config: RepoConfig, +) -> Dict[str, np.dtype]: + if isinstance(entity_df, pd.DataFrame): + return dict(zip(entity_df.columns, entity_df.dtypes)) + + elif isinstance(entity_df, str): + df_query = f"({entity_df}) AS sub" + return get_query_schema(config.offline_store, df_query) + else: + raise InvalidEntityType(type(entity_df)) + + # Copied from the Feast Redshift offline store implementation # Note: Keep this in sync with sdk/python/feast/infra/offline_stores/redshift.py: # MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres_source.py index 74b7a5df8a..8a2e13e5c1 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres_source.py @@ -1,35 +1,48 @@ import json from typing import Callable, Dict, Iterable, Optional, Tuple +from typeguard import typechecked + from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException from feast.infra.utils.postgres.connection_utils import _get_conn from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) from feast.repo_config import RepoConfig +from feast.saved_dataset import SavedDatasetStorage from feast.type_map import pg_type_code_to_pg_type, pg_type_to_feast_value_type from feast.value_type import ValueType +@typechecked class PostgreSQLSource(DataSource): def __init__( self, - name: str, - query: str, + name: Optional[str] = None, + query: Optional[str] = None, + table: Optional[str] = None, timestamp_field: Optional[str] = "", created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = 
None, - date_partition_column: Optional[str] = "", description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", ): - self._postgres_options = PostgreSQLOptions(name=name, query=query) + self._postgres_options = PostgreSQLOptions(name=name, query=query, table=table) + + # If no name, use the table as the default name. + if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name super().__init__( name=name, timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, - date_partition_column=date_partition_column, description=description, tags=tags, owner=owner, @@ -45,7 +58,8 @@ def __eq__(self, other): ) return ( - self._postgres_options._query == other._postgres_options._query + super().__eq__(other) + and self._postgres_options._query == other._postgres_options._query and self.timestamp_field == other.timestamp_field and self.created_timestamp_column == other.created_timestamp_column and self.field_mapping == other.field_mapping @@ -56,13 +70,14 @@ def from_proto(data_source: DataSourceProto): assert data_source.HasField("custom_options") postgres_options = json.loads(data_source.custom_options.configuration) + return PostgreSQLSource( name=postgres_options["name"], query=postgres_options["query"], + table=postgres_options["table"], field_mapping=dict(data_source.field_mapping), timestamp_field=data_source.timestamp_field, created_timestamp_column=data_source.created_timestamp_column, - date_partition_column=data_source.date_partition_column, description=data_source.description, tags=dict(data_source.tags), owner=data_source.owner, @@ -82,7 +97,6 @@ def to_proto(self) -> DataSourceProto: data_source_proto.timestamp_field = self.timestamp_field data_source_proto.created_timestamp_column = self.created_timestamp_column - data_source_proto.date_partition_column = self.date_partition_column return data_source_proto @@ -105,26 
+119,60 @@ def get_table_column_names_and_types( ) def get_table_query_string(self) -> str: - return f"({self._postgres_options._query})" + + if self._postgres_options._table: + return f"{self._postgres_options._table}" + else: + return f"({self._postgres_options._query})" class PostgreSQLOptions: - def __init__(self, name: str, query: Optional[str]): - self._name = name - self._query = query + def __init__( + self, + name: Optional[str], + query: Optional[str], + table: Optional[str], + ): + self._name = name or "" + self._query = query or "" + self._table = table or "" @classmethod def from_proto(cls, postgres_options_proto: DataSourceProto.CustomSourceOptions): config = json.loads(postgres_options_proto.configuration.decode("utf8")) - postgres_options = cls(name=config["name"], query=config["query"]) + postgres_options = cls( + name=config["name"], query=config["query"], table=config["table"] + ) return postgres_options def to_proto(self) -> DataSourceProto.CustomSourceOptions: postgres_options_proto = DataSourceProto.CustomSourceOptions( configuration=json.dumps( - {"name": self._name, "query": self._query} + {"name": self._name, "query": self._query, "table": self._table} ).encode() ) - return postgres_options_proto + + +class SavedDatasetPostgreSQLStorage(SavedDatasetStorage): + _proto_attr_name = "custom_storage" + + postgres_options: PostgreSQLOptions + + def __init__(self, table_ref: str): + self.postgres_options = PostgreSQLOptions( + table=table_ref, name=None, query=None + ) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + return SavedDatasetPostgreSQLStorage( + table_ref=PostgreSQLOptions.from_proto(storage_proto.custom_storage)._table + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto(custom_storage=self.postgres_options.to_proto()) + + def to_data_source(self) -> DataSource: + return PostgreSQLSource(table=self.postgres_options._table) diff --git 
a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index c84fce03dc..f447950132 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -91,7 +91,6 @@ def create_data_source( if self.offline_store_config: df_to_postgres_table(self.offline_store_config, df, destination_name) - return PostgreSQLSource( name=destination_name, query=f"SELECT * FROM {destination_name}", diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_repo_configuration.py index 9b107aa7a3..2fa08bf47a 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_repo_configuration.py @@ -1,7 +1,11 @@ from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( PostgreSQLDataSourceCreator, ) +from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG +from tests.integration.feature_repos.universal.online_store.redis import ( + RedisOnlineStoreCreator, +) AVAILABLE_OFFLINE_STORES = [("local", PostgreSQLDataSourceCreator)] -AVAILABLE_ONLINE_STORES = {"postgres": (None, PostgreSQLDataSourceCreator)} +AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py index 2437714dec..58519014b4 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py @@ -1,11 +1,13 @@ +import tempfile import warnings from datetime import 
datetime -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pandas import pandas as pd import pyarrow +import pyarrow.parquet as pq import pyspark from pydantic import StrictStr from pyspark import SparkConf @@ -26,7 +28,7 @@ RetrievalJob, RetrievalMetadata, ) -from feast.registry import Registry +from feast.infra.registry.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.type_map import spark_schema_to_np_dtypes @@ -119,6 +121,9 @@ def get_historical_features( full_feature_names: bool = False, ) -> RetrievalJob: assert isinstance(config.offline_store, SparkOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, SparkSource) + warnings.warn( "The spark offline store is an experimental feature in alpha development. " "Some functionality may still be unstable so functionality can change in the future.", @@ -189,6 +194,58 @@ def get_historical_features( ), ) + @staticmethod + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + table: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): + assert isinstance(config.offline_store, SparkOfflineStoreConfig) + assert isinstance(feature_view.batch_source, SparkSource) + + pa_schema, column_names = offline_utils.get_pyarrow_schema_from_batch_source( + config, feature_view.batch_source + ) + if column_names != table.column_names: + raise ValueError( + f"The input pyarrow table has schema {table.schema} with the incorrect columns {table.column_names}. " + f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." 
+ ) + + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + + if feature_view.batch_source.path: + # write data to disk so that it can be loaded into spark (for preserving column types) + with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp_file: + print(tmp_file.name) + pq.write_table(table, tmp_file.name) + + # load data + df_batch = spark_session.read.parquet(tmp_file.name) + + # load existing data to get spark table schema + df_existing = spark_session.read.format( + feature_view.batch_source.file_format + ).load(feature_view.batch_source.path) + + # cast columns if applicable + df_batch = _cast_data_frame(df_batch, df_existing) + + df_batch.write.format(feature_view.batch_source.file_format).mode( + "append" + ).save(feature_view.batch_source.path) + elif feature_view.batch_source.query: + raise NotImplementedError( + "offline_write_batch not implemented for batch sources specified by query" + ) + else: + raise NotImplementedError( + "offline_write_batch not implemented for batch sources specified by a table" + ) + @staticmethod @log_exceptions_and_usage(offline_store="spark") def pull_all_from_table_or_query( @@ -205,6 +262,7 @@ def pull_all_from_table_or_query( created_timestamp_column have all already been mapped to column names of the source table and those column names are the values passed into this function. """ + assert isinstance(config.offline_store, SparkOfflineStoreConfig) assert isinstance(data_source, SparkSource) warnings.warn( "The spark offline store is an experimental feature in alpha development. 
" @@ -245,7 +303,7 @@ def __init__( self.spark_session = spark_session self.query = query self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata @property @@ -253,7 +311,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views def to_spark_df(self) -> pyspark.sql.DataFrame: @@ -267,10 +325,13 @@ def _to_df_internal(self) -> pd.DataFrame: def _to_arrow_internal(self) -> pyarrow.Table: """Return dataset as pyarrow Table synchronously""" - df = self.to_df() - return pyarrow.Table.from_pandas(df) # noqa - def persist(self, storage: SavedDatasetStorage): + # write to temp parquet and then load it as pyarrow table from disk + with tempfile.TemporaryDirectory() as temp_dir: + self.to_spark_df().write.parquet(temp_dir, mode="overwrite") + return pq.read_table(temp_dir) + + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): """ Run the retrieval and persist the results in the same offline store used for read. Please note the persisting is done only within the scope of the spark session. 
@@ -383,6 +444,24 @@ def _format_datetime(t: datetime) -> str: return dt +def _cast_data_frame( + df_new: pyspark.sql.DataFrame, df_existing: pyspark.sql.DataFrame +) -> pyspark.sql.DataFrame: + """Convert new dataframe's columns to the same types as existing dataframe while preserving the order of columns""" + existing_dtypes = {k: v for k, v in df_existing.dtypes} + new_dtypes = {k: v for k, v in df_new.dtypes} + + select_expression = [] + for col, new_type in new_dtypes.items(): + existing_type = existing_dtypes[col] + if new_type != existing_type: + select_expression.append(f"cast({col} as {existing_type}) as {col}") + else: + select_expression.append(col) + + return df_new.selectExpr(*select_expression) + + MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """ /* Compute a deterministic hash for the `left_table_query_string` that will be used throughout diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py index 454e7ee87e..5b9f562181 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py @@ -6,6 +6,7 @@ from pyspark.sql import SparkSession +from feast import flags_helper from feast.data_source import DataSource from feast.errors import DataSourceNoNameException from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name @@ -41,44 +42,34 @@ def __init__( event_timestamp_column: Optional[str] = None, created_timestamp_column: Optional[str] = None, field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", timestamp_field: Optional[str] = None, ): - # If no name, use the table_ref as the default name - _name = name - if not _name: - if table: - _name = table - else: - 
raise DataSourceNoNameException() - - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is not supported for Spark sources." - "It will be removed in Feast 0.24+" - ), - DeprecationWarning, - ) + # If no name, use the table as the default name. + if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name super().__init__( - name=_name, - event_timestamp_column=event_timestamp_column, + name=name, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, - ) - warnings.warn( - "The spark data source API is an experimental feature in alpha development. " - "This API is unstable and it could and most probably will be changed in the future.", - RuntimeWarning, ) + + if not flags_helper.is_test(): + warnings.warn( + "The spark data source API is an experimental feature in alpha development. 
" + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) + self.spark_options = SparkOptions( table=table, query=query, diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/tests/data_source.py index 65cdde9457..ab1acbef73 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/tests/data_source.py @@ -1,3 +1,6 @@ +import os +import shutil +import tempfile import uuid from typing import Dict, List @@ -48,6 +51,8 @@ def __init__(self, project_name: str, *args, **kwargs): def teardown(self): self.spark_session.stop() + for table in self.tables: + shutil.rmtree(table) def create_offline_store_config(self): self.spark_offline_store_config = SparkOfflineStoreConfig() @@ -86,14 +91,19 @@ def create_data_source( .appName("pytest-pyspark-local-testing") .getOrCreate() ) - self.spark_session.createDataFrame(df).createOrReplaceTempView(destination_name) - self.tables.append(destination_name) + temp_dir = tempfile.mkdtemp(prefix="spark_offline_store_test_data") + + path = os.path.join(temp_dir, destination_name) + self.tables.append(path) + + self.spark_session.createDataFrame(df).write.parquet(path) return SparkSource( - table=destination_name, + name=destination_name, + file_format="parquet", + path=path, timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, - date_partition_column="", field_mapping=field_mapping or {"ts_1": "ts"}, ) diff --git a/sdk/python/feast/infra/offline_stores/contrib/contrib_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/spark_repo_configuration.py similarity index 75% rename from sdk/python/feast/infra/offline_stores/contrib/contrib_repo_configuration.py rename to 
sdk/python/feast/infra/offline_stores/contrib/spark_repo_configuration.py index 083ec2b210..ec414f202a 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/contrib_repo_configuration.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_repo_configuration.py @@ -1,9 +1,6 @@ from feast.infra.offline_stores.contrib.spark_offline_store.tests.data_source import ( SparkDataSourceCreator, ) -from feast.infra.offline_stores.contrib.trino_offline_store.tests.data_source import ( - TrinoSourceCreator, -) from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG from tests.integration.feature_repos.universal.online_store.redis import ( RedisOnlineStoreCreator, @@ -11,7 +8,6 @@ AVAILABLE_OFFLINE_STORES = [ ("local", SparkDataSourceCreator), - ("local", TrinoSourceCreator), ] AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py index 88a9021d1c..6c25b5768f 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py @@ -25,8 +25,8 @@ RetrievalJob, RetrievalMetadata, ) +from feast.infra.registry.registry import Registry from feast.on_demand_feature_view import OnDemandFeatureView -from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.usage import log_exceptions_and_usage @@ -74,7 +74,7 @@ def __init__( self._client = client self._config = config self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata @property @@ -82,7 +82,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def 
on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views def _to_df_internal(self) -> pd.DataFrame: @@ -126,7 +126,7 @@ def to_trino( self._client.execute_query(query_text=query) return destination_table - def persist(self, storage: SavedDatasetStorage): + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): """ Run the retrieval and persist the results in the same offline store used for read. """ @@ -161,14 +161,8 @@ def pull_latest_from_table_or_query( auth: Optional[Authentication] = None, http_scheme: Optional[str] = None, ) -> TrinoRetrievalJob: - if not isinstance(data_source, TrinoSource): - raise ValueError( - f"The data_source object is not a TrinoSource but is instead '{type(data_source)}'" - ) - if not isinstance(config.offline_store, TrinoOfflineStoreConfig): - raise ValueError( - f"The config.offline_store object is not a TrinoOfflineStoreConfig but is instead '{type(config.offline_store)}'" - ) + assert isinstance(config.offline_store, TrinoOfflineStoreConfig) + assert isinstance(data_source, TrinoSource) from_expression = data_source.get_table_query_string() @@ -222,10 +216,9 @@ def get_historical_features( auth: Optional[Authentication] = None, http_scheme: Optional[str] = None, ) -> TrinoRetrievalJob: - if not isinstance(config.offline_store, TrinoOfflineStoreConfig): - raise ValueError( - f"This function should be used with a TrinoOfflineStoreConfig object. 
Instead we have config.offline_store being '{type(config.offline_store)}'" - ) + assert isinstance(config.offline_store, TrinoOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, TrinoSource) client = _get_trino_client( config=config, user=user, auth=auth, http_scheme=http_scheme @@ -314,10 +307,8 @@ def pull_all_from_table_or_query( auth: Optional[Authentication] = None, http_scheme: Optional[str] = None, ) -> RetrievalJob: - if not isinstance(data_source, TrinoSource): - raise ValueError( - f"The data_source object is not a TrinoSource object but is instead a {type(data_source)}" - ) + assert isinstance(config.offline_store, TrinoOfflineStoreConfig) + assert isinstance(data_source, TrinoSource) from_expression = data_source.get_table_query_string() client = _get_trino_client( diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py index d82650712e..6e989bd40c 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py @@ -2,6 +2,7 @@ from feast import ValueType from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException from feast.infra.offline_stores.contrib.trino_offline_store.trino_queries import Trino from feast.infra.offline_stores.contrib.trino_offline_store.trino_type_map import ( trino_to_feast_value_type, @@ -86,26 +87,51 @@ class TrinoSource(DataSource): def __init__( self, *, - event_timestamp_column: Optional[str] = "", + name: Optional[str] = None, + timestamp_field: Optional[str] = None, table: Optional[str] = None, created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, query: Optional[str] = None, - name: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: 
Optional[str] = "", - timestamp_field: Optional[str] = None, ): + """ + Creates a TrinoSource object. + + Args: + name (optional): Name for the source. Defaults to the table if not specified, in which + case the table must be specified. + timestamp_field (optional): Event timestamp field used for point in time + joins of feature values. + table (optional): Trino table where the features are stored. Exactly one of 'table' and + 'query' must be specified. + created_timestamp_column (optional): Timestamp column indicating when the + row was created, used for deduplicating rows. + field_mapping (optional): A dictionary mapping of column names in this data + source to column names in a feature table or view. + query (optional): The query to be executed to obtain the features. Exactly one of 'table' + and 'query' must be specified. + description (optional): A human-readable description. + tags (optional): A dictionary of key-value pairs to store arbitrary metadata. + owner (optional): The owner of the trino source, typically the email of the primary + maintainer. + """ + # If no name, use the table as the default name. 
+ if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name + super().__init__( name=name if name else "", - event_timestamp_column=event_timestamp_column, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) self._trino_options = TrinoOptions(table=table, query=query) @@ -120,7 +146,8 @@ def __eq__(self, other): ) return ( - self.name == other.name + super().__eq__(other) + and self.name == other.name and self.trino_options.table == other.trino_options.table and self.trino_options.query == other.trino_options.query and self.timestamp_field == other.timestamp_field @@ -183,7 +210,6 @@ def to_proto(self) -> DataSourceProto: data_source_proto.timestamp_field = self.timestamp_field data_source_proto.created_timestamp_column = self.created_timestamp_column - data_source_proto.date_partition_column = self.date_partition_column return data_source_proto diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/trino_repo_configuration.py new file mode 100644 index 0000000000..198227095d --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_repo_configuration.py @@ -0,0 +1,13 @@ +from feast.infra.offline_stores.contrib.trino_offline_store.tests.data_source import ( + TrinoSourceCreator, +) +from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG +from tests.integration.feature_repos.universal.online_store.redis import ( + RedisOnlineStoreCreator, +) + +AVAILABLE_OFFLINE_STORES = [ + ("local", TrinoSourceCreator), +] + +AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 829bd36c3d..09216ff8ff 100644 --- 
a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -1,3 +1,4 @@ +import os import uuid from datetime import datetime from pathlib import Path @@ -11,13 +12,16 @@ import pytz from pydantic.typing import Literal -from feast import FileSource, OnDemandFeatureView from feast.data_source import DataSource -from feast.errors import FeastJoinKeysDuringMaterialization +from feast.errors import ( + FeastJoinKeysDuringMaterialization, + SavedDatasetLocationAlreadyExists, +) from feast.feature_logging import LoggingConfig, LoggingSource from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView from feast.infra.offline_stores.file_source import ( FileLoggingDestination, + FileSource, SavedDatasetFileStorage, ) from feast.infra.offline_stores.offline_store import ( @@ -29,7 +33,8 @@ DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, get_pyarrow_schema_from_batch_source, ) -from feast.registry import BaseRegistry +from feast.infra.registry.base_registry import BaseRegistry +from feast.on_demand_feature_view import OnDemandFeatureView from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.usage import log_exceptions_and_usage @@ -59,9 +64,7 @@ def __init__( # The evaluation function executes a stored procedure to compute a historical retrieval. 
self.evaluation_function = evaluation_function self._full_feature_names = full_feature_names - self._on_demand_feature_views = ( - on_demand_feature_views if on_demand_feature_views else [] - ) + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata @property @@ -69,7 +72,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views @log_exceptions_and_usage @@ -85,8 +88,13 @@ def _to_arrow_internal(self): df = self.evaluation_function().compute() return pyarrow.Table.from_pandas(df) - def persist(self, storage: SavedDatasetStorage): + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): assert isinstance(storage, SavedDatasetFileStorage) + + # Check if the specified location already exists. + if not allow_overwrite and os.path.exists(storage.file_options.uri): + raise SavedDatasetLocationAlreadyExists(location=storage.file_options.uri) + filesystem, path = FileSource.create_filesystem_and_path( storage.file_options.uri, storage.file_options.s3_endpoint_override, @@ -122,6 +130,10 @@ def get_historical_features( project: str, full_feature_names: bool = False, ) -> RetrievalJob: + assert isinstance(config.offline_store, FileOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, FileSource) + if not isinstance(entity_df, pd.DataFrame) and not isinstance( entity_df, dd.DataFrame ): @@ -290,6 +302,7 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, FileOfflineStoreConfig) assert isinstance(data_source, FileSource) # Create lazy function that is only called from the RetrievalJob object @@ -370,6 +383,9 @@ def pull_all_from_table_or_query( start_date: datetime, end_date: datetime, ) -> 
RetrievalJob: + assert isinstance(config.offline_store, FileOfflineStoreConfig) + assert isinstance(data_source, FileSource) + return FileOfflineStore.pull_latest_from_table_or_query( config=config, data_source=data_source, @@ -390,6 +406,7 @@ def write_logged_features( logging_config: LoggingConfig, registry: BaseRegistry, ): + assert isinstance(config.offline_store, FileOfflineStoreConfig) destination = logging_config.destination assert isinstance(destination, FileLoggingDestination) @@ -420,18 +437,8 @@ def offline_write_batch( table: pyarrow.Table, progress: Optional[Callable[[int], Any]], ): - if not feature_view.batch_source: - raise ValueError( - "feature view does not have a batch source to persist offline data" - ) - if not isinstance(config.offline_store, FileOfflineStoreConfig): - raise ValueError( - f"offline store config is of type {type(config.offline_store)} when file type required" - ) - if not isinstance(feature_view.batch_source, FileSource): - raise ValueError( - f"feature view batch source is {type(feature_view.batch_source)} not file source" - ) + assert isinstance(config.offline_store, FileOfflineStoreConfig) + assert isinstance(feature_view.batch_source, FileSource) pa_schema, column_names = get_pyarrow_schema_from_batch_source( config, feature_view.batch_source diff --git a/sdk/python/feast/infra/offline_stores/file_source.py b/sdk/python/feast/infra/offline_stores/file_source.py index 6fdf95470f..135409ed04 100644 --- a/sdk/python/feast/infra/offline_stores/file_source.py +++ b/sdk/python/feast/infra/offline_stores/file_source.py @@ -1,4 +1,3 @@ -import warnings from typing import Callable, Dict, Iterable, List, Optional, Tuple from pyarrow._fs import FileSystem @@ -26,35 +25,33 @@ class FileSource(DataSource): def __init__( self, - *args, - path: Optional[str] = None, + *, + path: str, + name: Optional[str] = "", event_timestamp_column: Optional[str] = "", file_format: Optional[FileFormat] = None, created_timestamp_column: Optional[str] = 
"", field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = "", s3_endpoint_override: Optional[str] = None, - name: Optional[str] = "", description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", timestamp_field: Optional[str] = "", ): - """Create a FileSource from a file containing feature data. Only Parquet format supported. + """ + Creates a FileSource object. Args: - path: File path to file containing feature data. Must contain an event_timestamp column, entity columns and feature columns. + name (optional): Name for the file source. Defaults to the path. event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event timestamp column used for point in time joins of feature values. created_timestamp_column (optional): Timestamp column when row was created, used for deduplicating rows. file_format (optional): Explicitly set the file format. Allows Feast to bypass inferring the file format. field_mapping: A dictionary mapping of column names in this data source to feature names in a feature table or view. Only used for feature columns, not entities or timestamp columns. - date_partition_column (optional): Timestamp column used for partitioning. s3_endpoint_override (optional): Overrides AWS S3 enpoint with custom S3 storage - name (optional): Name for the file source. Defaults to the path. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the file source, typically the email of the primary @@ -66,52 +63,20 @@ def __init__( >>> from feast import FileSource >>> file_source = FileSource(path="my_features.parquet", timestamp_field="event_timestamp") """ - positional_attributes = ["path"] - _path = path - if args: - if args: - warnings.warn( - ( - "File Source parameters should be specified as a keyword argument instead of a positional arg." 
- "Feast 0.24+ will not support positional arguments to construct File sources" - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args when defining " - f"File sources, for backwards compatibility." - ) - if len(args) >= 1: - _path = args[0] - if _path is None: - raise ValueError( - 'No "path" argument provided. Please set "path" to the location of your file source.' - ) self.file_options = FileOptions( file_format=file_format, - uri=_path, + uri=path, s3_endpoint_override=s3_endpoint_override, ) - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is not supported for File sources." - "It will be removed in Feast 0.24+" - ), - DeprecationWarning, - ) - super().__init__( name=name if name else path, - event_timestamp_column=event_timestamp_column, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) # Note: Python requires redefining hash in child classes that override __eq__ @@ -131,12 +96,20 @@ def __eq__(self, other): ) @property - def path(self): - """ - Returns the path of this file data source. - """ + def path(self) -> str: + """Returns the path of this file data source.""" return self.file_options.uri + @property + def file_format(self) -> Optional[FileFormat]: + """Returns the file format of this file data source.""" + return self.file_options.file_format + + @property + def s3_endpoint_override(self) -> Optional[str]: + """Returns the s3 endpoint override of this file data source.""" + return self.file_options.s3_endpoint_override + @staticmethod def from_proto(data_source: DataSourceProto): return FileSource( @@ -212,24 +185,33 @@ def get_table_query_string(self) -> str: class FileOptions: """ Configuration options for a file data source. 
+ + Attributes: + uri: File source url, e.g. s3:// or local file. + s3_endpoint_override: Custom s3 endpoint (used only with s3 uri). + file_format: File source format, e.g. parquet. """ + uri: str + file_format: Optional[FileFormat] + s3_endpoint_override: str + def __init__( self, + uri: str, file_format: Optional[FileFormat], s3_endpoint_override: Optional[str], - uri: Optional[str], ): """ Initializes a FileOptions object. Args: + uri: File source url, e.g. s3:// or local file. file_format (optional): File source format, e.g. parquet. s3_endpoint_override (optional): Custom s3 endpoint (used only with s3 uri). - uri (optional): File source url, e.g. s3:// or local file. """ + self.uri = uri self.file_format = file_format - self.uri = uri or "" self.s3_endpoint_override = s3_endpoint_override or "" @classmethod @@ -304,6 +286,17 @@ def to_data_source(self) -> DataSource: s3_endpoint_override=self.file_options.s3_endpoint_override, ) + @staticmethod + def from_data_source(data_source: DataSource) -> "SavedDatasetStorage": + assert isinstance(data_source, FileSource) + return SavedDatasetFileStorage( + path=data_source.path, + file_format=data_source.file_format + if data_source.file_format + else ParquetFormat(), + s3_endpoint_override=data_source.s3_endpoint_override, + ) + class FileLoggingDestination(LoggingDestination): _proto_kind = "file_destination" diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index c8a0cb8a5c..9331b75ec2 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -20,12 +20,13 @@ import pandas as pd import pyarrow +from feast import flags_helper from feast.data_source import DataSource from feast.dqm.errors import ValidationFailed from feast.feature_logging import LoggingConfig, LoggingSource from feast.feature_view import FeatureView +from feast.infra.registry.base_registry import 
BaseRegistry from feast.on_demand_feature_view import OnDemandFeatureView -from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage @@ -58,25 +59,19 @@ def __init__( class RetrievalJob(ABC): - """RetrievalJob is used to manage the execution of a historical feature retrieval""" - - @property - @abstractmethod - def full_feature_names(self) -> bool: - pass - - @property - @abstractmethod - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: - pass + """A RetrievalJob manages the execution of a query to retrieve data from the offline store.""" def to_df( self, validation_reference: Optional["ValidationReference"] = None ) -> pd.DataFrame: """ - Return dataset as Pandas DataFrame synchronously including on demand transforms + Synchronously executes the underlying query and returns the result as a pandas dataframe. + + On demand transformations will be executed. If a validation reference is provided, the dataframe + will be validated. + Args: - validation_reference: If provided resulting dataset will be validated against this reference profile. + validation_reference (optional): The validation to apply against the retrieved dataframe. """ features_df = self._to_df_internal() @@ -91,12 +86,13 @@ def to_df( ) if validation_reference: - warnings.warn( - "Dataset validation is an experimental feature. " - "This API is unstable and it could and most probably will be changed in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if not flags_helper.is_test(): + warnings.warn( + "Dataset validation is an experimental feature. " + "This API is unstable and it could and most probably will be changed in the future. 
" + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) validation_result = validation_reference.profile.validate(features_df) if not validation_result.is_success: @@ -104,23 +100,17 @@ def to_df( return features_df - @abstractmethod - def _to_df_internal(self) -> pd.DataFrame: - """Return dataset as Pandas DataFrame synchronously""" - pass - - @abstractmethod - def _to_arrow_internal(self) -> pyarrow.Table: - """Return dataset as pyarrow Table synchronously""" - pass - def to_arrow( self, validation_reference: Optional["ValidationReference"] = None ) -> pyarrow.Table: """ - Return dataset as pyarrow Table synchronously + Synchronously executes the underlying query and returns the result as an arrow table. + + On demand transformations will be executed. If a validation reference is provided, the dataframe + will be validated. + Args: - validation_reference: If provided resulting dataset will be validated against this reference profile. + validation_reference (optional): The validation to apply against the retrieved dataframe. """ if not self.on_demand_feature_views and not validation_reference: return self._to_arrow_internal() @@ -136,12 +126,13 @@ def to_arrow( ) if validation_reference: - warnings.warn( - "Dataset validation is an experimental feature. " - "This API is unstable and it could and most probably will be changed in the future. " - "We do not guarantee that future changes will maintain backward compatibility.", - RuntimeWarning, - ) + if not flags_helper.is_test(): + warnings.warn( + "Dataset validation is an experimental feature. " + "This API is unstable and it could and most probably will be changed in the future. 
" + "We do not guarantee that future changes will maintain backward compatibility.", + RuntimeWarning, + ) validation_result = validation_reference.profile.validate(features_df) if not validation_result.is_success: @@ -149,36 +140,73 @@ def to_arrow( return pyarrow.Table.from_pandas(features_df) + def to_sql(self) -> str: + """ + Return RetrievalJob generated SQL statement if applicable. + """ + pass + @abstractmethod - def persist(self, storage: SavedDatasetStorage): + def _to_df_internal(self) -> pd.DataFrame: """ - Run the retrieval and persist the results in the same offline store used for read. + Synchronously executes the underlying query and returns the result as a pandas dataframe. + + Does not handle on demand transformations or dataset validation. For either of those, + `to_df` should be used. """ pass - @property @abstractmethod - def metadata(self) -> Optional[RetrievalMetadata]: + def _to_arrow_internal(self) -> pyarrow.Table: """ - Return metadata information about retrieval. - Should be available even before materializing the dataset itself. + Synchronously executes the underlying query and returns the result as an arrow table. + + Does not handle on demand transformations or dataset validation. For either of those, + `to_arrow` should be used. """ pass - def supports_remote_storage_export(self) -> bool: + @property + @abstractmethod + def full_feature_names(self) -> bool: + """Returns True if full feature names should be applied to the results of the query.""" + pass + + @property + @abstractmethod + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: + """Returns a list containing all the on demand feature views to be handled.""" + pass + + @abstractmethod + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): """ - This method should return True if the RetrievalJob supports `to_remote_storage()`. 
+ Synchronously executes the underlying query and persists the result in the same offline store + at the specified destination. + + Args: + storage: The saved dataset storage object specifying where the result should be persisted. + allow_overwrite: If True, a pre-existing location (e.g. table or file) can be overwritten. + Currently not all individual offline store implementations make use of this parameter. """ + pass + + @property + @abstractmethod + def metadata(self) -> Optional[RetrievalMetadata]: + """Returns metadata about the retrieval job.""" + pass + + def supports_remote_storage_export(self) -> bool: + """Returns True if the RetrievalJob supports `to_remote_storage`.""" return False def to_remote_storage(self) -> List[str]: """ - This method should export the result of this RetrievalJob to - remote storage (such as S3, GCS, HDFS, etc). - Implementations of this method should export the results as - multiple parquet files, each file sized appropriately - depending on how much data is being returned by the retrieval - job. + Synchronously executes the underlying query and exports the results to remote storage (e.g. S3 or GCS). + + Implementations of this method should export the results as multiple parquet files, each file sized + appropriately depending on how much data is being returned by the retrieval job. Returns: A list of parquet file paths in remote storage. @@ -188,8 +216,11 @@ def to_remote_storage(self) -> List[str]: class OfflineStore(ABC): """ - OfflineStore is an object used for all interaction between Feast and the service used for offline storage of - features. + An offline store defines the interface that Feast uses to interact with the storage and compute system that + handles offline features. + + Each offline store implementation is designed to work only with the corresponding data source. For example, + the SnowflakeOfflineStore can handle SnowflakeSources but not FileSources. 
""" @staticmethod @@ -205,24 +236,24 @@ def pull_latest_from_table_or_query( end_date: datetime, ) -> RetrievalJob: """ - This method pulls data from the offline store, and the FeatureStore class is used to write - this data into the online store. This method is invoked when running materialization (using - the `feast materialize` or `feast materialize-incremental` commands, or the corresponding - FeatureStore.materialize() method. This method pulls data from the offline store, and the FeatureStore - class is used to write this data into the online store. + Extracts the latest entity rows (i.e. the combination of join key columns, feature columns, and + timestamp columns) from the specified data source that lie within the specified time range. - Note that join_key_columns, feature_name_columns, timestamp_field, and created_timestamp_column - have all already been mapped to column names of the source table and those column names are the values passed - into this function. + All of the column names should refer to columns that exist in the data source. In particular, + any mapping of column names must have already happened. Args: - config: Repo configuration object - data_source: Data source to pull all of the columns from - join_key_columns: Columns of the join keys - feature_name_columns: Columns of the feature names needed - timestamp_field: Timestamp column - start_date: Starting date of query - end_date: Ending date of query + config: The config for the current feature store. + data_source: The data source from which the entity rows will be extracted. + join_key_columns: The columns of the join keys. + feature_name_columns: The columns of the features. + timestamp_field: The timestamp column, used to determine which rows are the most recent. + created_timestamp_column: The column indicating when the row was created, used to break ties. + start_date: The start of the time range. + end_date: The end of the time range. 
+ + Returns: + A RetrievalJob that can be executed to get the entity rows. """ pass @@ -237,6 +268,25 @@ def get_historical_features( project: str, full_feature_names: bool = False, ) -> RetrievalJob: + """ + Retrieves the point-in-time correct historical feature values for the specified entity rows. + + Args: + config: The config for the current feature store. + feature_views: A list containing all feature views that are referenced in the entity rows. + feature_refs: The features to be retrieved. + entity_df: A collection of rows containing all entity columns on which features need to be joined, + as well as the timestamp column used for point-in-time joins. Either a pandas dataframe can be + provided or a SQL query. + registry: The registry for the current feature store. + project: Feast project to which the feature views belong. + full_feature_names: If True, feature names will be prefixed with the corresponding feature view name, + changing them from the format "feature" to "feature_view__feature" (e.g. "daily_transactions" + changes to "customer_fv__daily_transactions"). + + Returns: + A RetrievalJob that can be executed to get the features. + """ pass @staticmethod @@ -251,20 +301,23 @@ def pull_all_from_table_or_query( end_date: datetime, ) -> RetrievalJob: """ - Returns a Retrieval Job for all join key columns, feature name columns, and the event timestamp columns that occur between the start_date and end_date. + Extracts all the entity rows (i.e. the combination of join key columns, feature columns, and + timestamp columns) from the specified data source that lie within the specified time range. - Note that join_key_columns, feature_name_columns, timestamp_field, and created_timestamp_column - have all already been mapped to column names of the source table and those column names are the values passed - into this function. + All of the column names should refer to columns that exist in the data source. 
In particular, + any mapping of column names must have already happened. Args: - config: Repo configuration object - data_source: Data source to pull all of the columns from - join_key_columns: Columns of the join keys - feature_name_columns: Columns of the feature names needed - timestamp_field: Timestamp column - start_date: Starting date of query - end_date: Ending date of query + config: The config for the current feature store. + data_source: The data source from which the entity rows will be extracted. + join_key_columns: The columns of the join keys. + feature_name_columns: The columns of the features. + timestamp_field: The timestamp column. + start_date: The start of the time range. + end_date: The end of the time range. + + Returns: + A RetrievalJob that can be executed to get the entity rows. """ pass @@ -277,19 +330,18 @@ def write_logged_features( registry: BaseRegistry, ): """ - Write logged features to a specified destination (taken from logging_config) in the offline store. - Data can be appended to an existing table (destination) or a new one will be created automatically - (if it doesn't exist). - Hence, this function can be called repeatedly with the same destination to flush logs in chunks. + Writes logged features to a specified destination in the offline store. - Args: - config: Repo configuration object - data: Arrow table or path to parquet directory that contains logs dataset. - source: Logging source that provides schema and some additional metadata. - logging_config: used to determine destination - registry: Feast registry + If the specified destination exists, data will be appended; otherwise, the destination will be + created and data will be added. Thus this function can be called repeatedly with the same + destination to flush logs in chunks. - This is an optional method that could be supported only be some stores. + Args: + config: The config for the current feature store. 
+ data: An arrow table or a path to parquet directory that contains the logs to write. + source: The logging source that provides a schema and some additional metadata. + logging_config: A LoggingConfig object that determines where the logs will be written. + registry: The registry for the current feature store. """ raise NotImplementedError() @@ -301,16 +353,13 @@ def offline_write_batch( progress: Optional[Callable[[int], Any]], ): """ - Write features to a specified destination in the offline store. - Data can be appended to an existing table (destination) or a new one will be created automatically - (if it doesn't exist). - Hence, this function can be called repeatedly with the same destination config to write features. + Writes the specified arrow table to the data source underlying the specified feature view. Args: - config: Repo configuration object - feature_view: FeatureView to write the data to. - table: pyarrow table containing feature data and timestamp column for historical feature retrieval - progress: Optional function to be called once every mini-batch of rows is written to - the online store. Can be used to display progress. + config: The config for the current feature store. + feature_view: The feature view whose batch source should be written. + table: The arrow table to write. + progress: Function to be called once a portion of the data has been written, used + to show progress. 
""" raise NotImplementedError() diff --git a/sdk/python/feast/infra/offline_stores/offline_utils.py b/sdk/python/feast/infra/offline_stores/offline_utils.py index 8b963a864b..42b8f8497a 100644 --- a/sdk/python/feast/infra/offline_stores/offline_utils.py +++ b/sdk/python/feast/infra/offline_stores/offline_utils.py @@ -17,7 +17,7 @@ from feast.feature_view import FeatureView from feast.importer import import_class from feast.infra.offline_stores.offline_store import OfflineStore -from feast.registry import BaseRegistry +from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import RepoConfig from feast.type_map import feast_value_type_to_pa from feast.utils import _get_requested_feature_views_to_features_dict, to_naive_utc @@ -93,6 +93,9 @@ class FeatureViewQueryContext: entity_selections: List[str] min_event_timestamp: Optional[str] max_event_timestamp: str + date_partition_column: Optional[ + str + ] # this attribute is added because partition pruning affects Athena's query performance. 
def get_feature_view_query_context( @@ -142,6 +145,11 @@ def get_feature_view_query_context( feature_view.batch_source.created_timestamp_column, ) + date_partition_column = reverse_field_mapping.get( + feature_view.batch_source.date_partition_column, + feature_view.batch_source.date_partition_column, + ) + max_event_timestamp = to_naive_utc(entity_df_timestamp_range[1]).isoformat() min_event_timestamp = None if feature_view.ttl: @@ -162,6 +170,7 @@ def get_feature_view_query_context( entity_selections=entity_selections, min_event_timestamp=min_event_timestamp, max_event_timestamp=max_event_timestamp, + date_partition_column=date_partition_column, ) query_context.append(context) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index df70f958f7..82b5150eaf 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -38,8 +38,8 @@ RedshiftLoggingDestination, SavedDatasetRedshiftStorage, ) +from feast.infra.registry.base_registry import BaseRegistry from feast.infra.utils import aws_utils -from feast.registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.usage import log_exceptions_and_usage @@ -141,6 +141,7 @@ def pull_all_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, RedshiftOfflineStoreConfig) assert isinstance(data_source, RedshiftSource) from_expression = data_source.get_table_query_string() @@ -182,6 +183,8 @@ def get_historical_features( full_feature_names: bool = False, ) -> RetrievalJob: assert isinstance(config.offline_store, RedshiftOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, RedshiftSource) redshift_client = aws_utils.get_redshift_data_client( config.offline_store.region @@ -308,18 +311,8 @@ def offline_write_batch( 
table: pyarrow.Table, progress: Optional[Callable[[int], Any]], ): - if not feature_view.batch_source: - raise ValueError( - "feature view does not have a batch source to persist offline data" - ) - if not isinstance(config.offline_store, RedshiftOfflineStoreConfig): - raise ValueError( - f"offline store config is of type {type(config.offline_store)} when redshift type required" - ) - if not isinstance(feature_view.batch_source, RedshiftSource): - raise ValueError( - f"feature view batch source is {type(feature_view.batch_source)} not redshift source" - ) + assert isinstance(config.offline_store, RedshiftOfflineStoreConfig) + assert isinstance(feature_view.batch_source, RedshiftSource) pa_schema, column_names = offline_utils.get_pyarrow_schema_from_batch_source( config, feature_view.batch_source @@ -395,9 +388,7 @@ def query_generator() -> Iterator[str]: + str(uuid.uuid4()) ) self._full_feature_names = full_feature_names - self._on_demand_feature_views = ( - on_demand_feature_views if on_demand_feature_views else [] - ) + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata @property @@ -405,7 +396,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views @log_exceptions_and_usage @@ -485,7 +476,7 @@ def to_redshift(self, table_name: str) -> None: query, ) - def persist(self, storage: SavedDatasetStorage): + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): assert isinstance(storage, SavedDatasetRedshiftStorage) self.to_redshift(table_name=storage.redshift_options.table) diff --git a/sdk/python/feast/infra/offline_stores/redshift_source.py b/sdk/python/feast/infra/offline_stores/redshift_source.py index 24b2a04f93..d6d51d0310 100644 --- a/sdk/python/feast/infra/offline_stores/redshift_source.py +++ 
b/sdk/python/feast/infra/offline_stores/redshift_source.py @@ -1,11 +1,14 @@ -import warnings from typing import Callable, Dict, Iterable, Optional, Tuple from typeguard import typechecked from feast import type_map from feast.data_source import DataSource -from feast.errors import DataSourceNotFoundException, RedshiftCredentialsError +from feast.errors import ( + DataSourceNoNameException, + DataSourceNotFoundException, + RedshiftCredentialsError, +) from feast.feature_logging import LoggingDestination from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.FeatureService_pb2 import ( @@ -24,81 +27,64 @@ class RedshiftSource(DataSource): def __init__( self, *, - event_timestamp_column: Optional[str] = "", + name: Optional[str] = None, + timestamp_field: Optional[str] = "", table: Optional[str] = None, schema: Optional[str] = None, created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = None, query: Optional[str] = None, - name: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", database: Optional[str] = "", - timestamp_field: Optional[str] = "", ): """ Creates a RedshiftSource object. Args: - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. - table (optional): Redshift table where the features are stored. + name (optional): Name for the source. Defaults to the table if not specified, in which + case the table must be specified. + timestamp_field (optional): Event timestamp field used for point in time + joins of feature values. + table (optional): Redshift table where the features are stored. Exactly one of 'table' + and 'query' must be specified. schema (optional): Redshift schema in which the table is located. 
created_timestamp_column (optional): Timestamp column indicating when the row was created, used for deduplicating rows. field_mapping (optional): A dictionary mapping of column names in this data source to column names in a feature table or view. - date_partition_column (deprecated): Timestamp column used for partitioning. - query (optional): The query to be executed to obtain the features. - name (optional): Name for the source. Defaults to the table_ref if not specified. + query (optional): The query to be executed to obtain the features. Exactly one of 'table' + and 'query' must be specified. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the redshift source, typically the email of the primary maintainer. database (optional): The Redshift database name. - timestamp_field (optional): Event timestamp field used for point in time - joins of feature values. """ + if table is None and query is None: + raise ValueError('No "table" or "query" argument provided.') + # The default Redshift schema is named "public". _schema = "public" if table and not schema else schema self.redshift_options = RedshiftOptions( table=table, schema=_schema, query=query, database=database ) - if table is None and query is None: - raise ValueError('No "table" argument provided.') - _name = name - if not _name: - if table: - _name = table - else: - warnings.warn( - ( - f"Starting in Feast 0.24, Feast will require either a name for a data source (if using query) " - f"or `table`: {self.query}" - ), - DeprecationWarning, - ) - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is not supported for Redshift sources." - "It will be removed in Feast 0.24+" - ), - DeprecationWarning, - ) + # If no name, use the table as the default name. 
+ if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name super().__init__( - name=_name if _name else "", - event_timestamp_column=event_timestamp_column, + name=name, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) @staticmethod @@ -114,11 +100,11 @@ def from_proto(data_source: DataSourceProto): """ return RedshiftSource( name=data_source.name, - field_mapping=dict(data_source.field_mapping), + timestamp_field=data_source.timestamp_field, table=data_source.redshift_options.table, schema=data_source.redshift_options.schema, - timestamp_field=data_source.timestamp_field, created_timestamp_column=data_source.created_timestamp_column, + field_mapping=dict(data_source.field_mapping), query=data_source.redshift_options.query, description=data_source.description, tags=dict(data_source.tags), diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py index 0f4c6a7b52..aab6871865 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake.py +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -19,8 +19,7 @@ import numpy as np import pandas as pd import pyarrow -import pyarrow as pa -from pydantic import Field +from pydantic import Field, StrictStr from pydantic.typing import Literal from pytz import utc @@ -40,13 +39,13 @@ SnowflakeLoggingDestination, SnowflakeSource, ) -from feast.infra.utils.snowflake_utils import ( +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.utils.snowflake.snowflake_utils import ( execute_snowflake_statement, get_snowflake_conn, write_pandas, write_parquet, ) -from feast.registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.usage import 
log_exceptions_and_usage @@ -85,10 +84,13 @@ class SnowflakeOfflineStoreConfig(FeastConfigBaseModel): warehouse: Optional[str] = None """ Snowflake warehouse name """ - database: Optional[str] = None + authenticator: Optional[str] = None + """ Snowflake authenticator name """ + + database: StrictStr """ Snowflake database name """ - schema_: Optional[str] = Field(None, alias="schema") + schema_: Optional[str] = Field("PUBLIC", alias="schema") """ Snowflake schema name """ storage_integration_name: Optional[str] = None @@ -114,12 +116,12 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: - assert isinstance(data_source, SnowflakeSource) assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + assert isinstance(data_source, SnowflakeSource) - from_expression = ( - data_source.get_table_query_string() - ) # returns schema.table as a string + from_expression = data_source.get_table_query_string() + if not data_source.database and data_source.table: + from_expression = f'"{config.offline_store.database}"."{config.offline_store.schema_}".{from_expression}' if join_key_columns: partition_by_join_key_string = '"' + '", "'.join(join_key_columns) + '"' @@ -145,6 +147,9 @@ def pull_latest_from_table_or_query( snowflake_conn = get_snowflake_conn(config.offline_store) + start_date = start_date.astimezone(tz=utc) + end_date = end_date.astimezone(tz=utc) + query = f""" SELECT {field_string} @@ -153,7 +158,7 @@ def pull_latest_from_table_or_query( SELECT {field_string}, ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS "_feast_row" FROM {from_expression} - WHERE "{timestamp_field}" BETWEEN TO_TIMESTAMP_NTZ({start_date.timestamp()}) AND TO_TIMESTAMP_NTZ({end_date.timestamp()}) + WHERE "{timestamp_field}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' ) WHERE "_feast_row" = 1 """ @@ -177,8 +182,12 @@ def pull_all_from_table_or_query( start_date: datetime, end_date: 
datetime, ) -> RetrievalJob: + assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) assert isinstance(data_source, SnowflakeSource) + from_expression = data_source.get_table_query_string() + if not data_source.database and data_source.table: + from_expression = f'"{config.offline_store.database}"."{config.offline_store.schema_}".{from_expression}' field_string = ( '"' @@ -219,6 +228,8 @@ def get_historical_features( full_feature_names: bool = False, ) -> RetrievalJob: assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, SnowflakeSource) snowflake_conn = get_snowflake_conn(config.offline_store) @@ -323,18 +334,8 @@ def offline_write_batch( table: pyarrow.Table, progress: Optional[Callable[[int], Any]], ): - if not feature_view.batch_source: - raise ValueError( - "feature view does not have a batch source to persist offline data" - ) - if not isinstance(config.offline_store, SnowflakeOfflineStoreConfig): - raise ValueError( - f"offline store config is of type {type(config.offline_store)} when snowflake type required" - ) - if not isinstance(feature_view.batch_source, SnowflakeSource): - raise ValueError( - f"feature view batch source is {type(feature_view.batch_source)} not snowflake source" - ) + assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + assert isinstance(feature_view.batch_source, SnowflakeSource) pa_schema, column_names = offline_utils.get_pyarrow_schema_from_batch_source( config, feature_view.batch_source @@ -383,9 +384,7 @@ def query_generator() -> Iterator[str]: self.snowflake_conn = snowflake_conn self.config = config self._full_feature_names = full_feature_names - self._on_demand_feature_views = ( - on_demand_feature_views if on_demand_feature_views else [] - ) + self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata self.export_path: Optional[str] if self.config.offline_store.blob_export_location: @@ 
-398,7 +397,7 @@ def full_feature_names(self) -> bool: return self._full_feature_names @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + def on_demand_feature_views(self) -> List[OnDemandFeatureView]: return self._on_demand_feature_views def _to_df_internal(self) -> pd.DataFrame: @@ -410,7 +409,7 @@ def _to_df_internal(self) -> pd.DataFrame: return df - def _to_arrow_internal(self) -> pa.Table: + def _to_arrow_internal(self) -> pyarrow.Table: with self._query_generator() as query: pa_table = execute_snowflake_statement( @@ -423,13 +422,13 @@ def _to_arrow_internal(self) -> pa.Table: else: empty_result = execute_snowflake_statement(self.snowflake_conn, query) - return pa.Table.from_pandas( + return pyarrow.Table.from_pandas( pd.DataFrame(columns=[md.name for md in empty_result.description]) ) def to_snowflake(self, table_name: str, temporary=False) -> None: """Save dataset as a new Snowflake table""" - if self.on_demand_feature_views is not None: + if self.on_demand_feature_views: transformed_df = self.to_df() write_pandas( @@ -459,7 +458,7 @@ def to_arrow_chunks(self, arrow_options: Optional[Dict] = None) -> Optional[List return arrow_batches - def persist(self, storage: SavedDatasetStorage): + def persist(self, storage: SavedDatasetStorage, allow_overwrite: bool = False): assert isinstance(storage, SavedDatasetSnowflakeStorage) self.to_snowflake(table_name=storage.snowflake_options.table) @@ -486,21 +485,22 @@ def to_remote_storage(self) -> List[str]: table = f"temporary_{uuid.uuid4().hex}" self.to_snowflake(table) - copy_into_query = f"""copy into '{self.config.offline_store.blob_export_location}/{table}' from "{self.config.offline_store.database}"."{self.config.offline_store.schema_}"."{table}"\n - storage_integration = {self.config.offline_store.storage_integration_name}\n - file_format = (TYPE = PARQUET)\n - DETAILED_OUTPUT = TRUE\n - HEADER = TRUE;\n + query = f""" + COPY INTO 
'{self.config.offline_store.blob_export_location}/{table}' FROM "{self.config.offline_store.database}"."{self.config.offline_store.schema_}"."{table}"\n + STORAGE_INTEGRATION = {self.config.offline_store.storage_integration_name}\n + FILE_FORMAT = (TYPE = PARQUET) + DETAILED_OUTPUT = TRUE + HEADER = TRUE """ + cursor = execute_snowflake_statement(self.snowflake_conn, query) - cursor = execute_snowflake_statement(self.snowflake_conn, copy_into_query) - all_rows = ( - cursor.fetchall() - ) # This may be need pagination at some point in the future. file_name_column_index = [ idx for idx, rm in enumerate(cursor.description) if rm.name == "FILE_NAME" ][0] - return [f"{self.export_path}/{row[file_name_column_index]}" for row in all_rows] + return [ + f"{self.export_path}/{row[file_name_column_index]}" + for row in cursor.fetchall() + ] def _get_entity_schema( @@ -532,6 +532,7 @@ def _upload_entity_df( if isinstance(entity_df, pd.DataFrame): # Write the data from the DataFrame to the table + # Known issues with following entity data types: BINARY write_pandas( snowflake_conn, entity_df, diff --git a/sdk/python/feast/infra/offline_stores/snowflake_source.py b/sdk/python/feast/infra/offline_stores/snowflake_source.py index 258fba71b1..a25e8fd903 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake_source.py +++ b/sdk/python/feast/infra/offline_stores/snowflake_source.py @@ -1,10 +1,10 @@ -import warnings from typing import Callable, Dict, Iterable, Optional, Tuple from typeguard import typechecked from feast import type_map from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException, DataSourceNotFoundException from feast.feature_logging import LoggingDestination from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.FeatureService_pb2 import ( @@ -23,45 +23,47 @@ class SnowflakeSource(DataSource): def __init__( self, *, + name: Optional[str] = None, + timestamp_field: 
Optional[str] = "", database: Optional[str] = None, warehouse: Optional[str] = None, schema: Optional[str] = None, table: Optional[str] = None, query: Optional[str] = None, - event_timestamp_column: Optional[str] = "", - date_partition_column: Optional[str] = None, created_timestamp_column: Optional[str] = "", field_mapping: Optional[Dict[str, str]] = None, - name: Optional[str] = None, description: Optional[str] = "", tags: Optional[Dict[str, str]] = None, owner: Optional[str] = "", - timestamp_field: Optional[str] = "", ): """ Creates a SnowflakeSource object. Args: + name (optional): Name for the source. Defaults to the table if not specified, in which + case the table must be specified. + timestamp_field (optional): Event timestamp field used for point in time + joins of feature values. database (optional): Snowflake database where the features are stored. warehouse (optional): Snowflake warehouse where the database is stored. schema (optional): Snowflake schema in which the table is located. - table (optional): Snowflake table where the features are stored. - event_timestamp_column (optional): (Deprecated in favor of timestamp_field) Event - timestamp column used for point in time joins of feature values. - query (optional): The query to be executed to obtain the features. + table (optional): Snowflake table where the features are stored. Exactly one of 'table' + and 'query' must be specified. + query (optional): The query to be executed to obtain the features. Exactly one of 'table' + and 'query' must be specified. created_timestamp_column (optional): Timestamp column indicating when the row was created, used for deduplicating rows. field_mapping (optional): A dictionary mapping of column names in this data source to column names in a feature table or view. - date_partition_column (deprecated): Timestamp column used for partitioning. - name (optional): Name for the source. Defaults to the table if not specified. 
description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the snowflake source, typically the email of the primary maintainer. """ if table is None and query is None: - raise ValueError('No "table" argument provided.') + raise ValueError('No "table" or "query" argument provided.') + if table and query: + raise ValueError('Both "table" and "query" argument provided.') # The default Snowflake schema is named "PUBLIC". _schema = "PUBLIC" if (database and table and not schema) else schema @@ -74,38 +76,20 @@ def __init__( warehouse=warehouse, ) - # If no name, use the table as the default name - _name = name - if not _name: - if table: - _name = table - else: - warnings.warn( - ( - f"Starting in Feast 0.24, Feast will require either a name for a data source (if using query) " - f"or `table`: {self.query}" - ), - DeprecationWarning, - ) - - if date_partition_column: - warnings.warn( - ( - "The argument 'date_partition_column' is not supported for Snowflake sources." - "It will be removed in Feast 0.24+" - ), - DeprecationWarning, - ) + # If no name, use the table as the default name. 
+ if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name super().__init__( - name=_name if _name else "", - event_timestamp_column=event_timestamp_column, + name=name, + timestamp_field=timestamp_field, created_timestamp_column=created_timestamp_column, field_mapping=field_mapping, description=description, tags=tags, owner=owner, - timestamp_field=timestamp_field, ) @staticmethod @@ -121,13 +105,13 @@ def from_proto(data_source: DataSourceProto): """ return SnowflakeSource( name=data_source.name, - field_mapping=dict(data_source.field_mapping), + timestamp_field=data_source.timestamp_field, database=data_source.snowflake_options.database, schema=data_source.snowflake_options.schema, table=data_source.snowflake_options.table, warehouse=data_source.snowflake_options.warehouse, - timestamp_field=data_source.timestamp_field, created_timestamp_column=data_source.created_timestamp_column, + field_mapping=dict(data_source.field_mapping), query=data_source.snowflake_options.query, description=data_source.description, tags=dict(data_source.tags), @@ -216,7 +200,7 @@ def get_table_query_string(self) -> str: @staticmethod def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: - return type_map.snowflake_python_type_to_feast_value_type + return type_map.snowflake_type_to_feast_value_type def get_table_column_names_and_types( self, config: RepoConfig @@ -227,31 +211,108 @@ def get_table_column_names_and_types( Args: config: A RepoConfig describing the feature repo """ - from feast.infra.offline_stores.snowflake import SnowflakeOfflineStoreConfig - from feast.infra.utils.snowflake_utils import ( + from feast.infra.utils.snowflake.snowflake_utils import ( execute_snowflake_statement, get_snowflake_conn, ) assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) - snowflake_conn = get_snowflake_conn(config.offline_store) - - if self.database and self.table: - query = f'SELECT * FROM 
"{self.database}"."{self.schema}"."{self.table}" LIMIT 1' - elif self.table: - query = f'SELECT * FROM "{self.table}" LIMIT 1' - else: - query = f"SELECT * FROM ({self.query}) LIMIT 1" + with get_snowflake_conn(config.offline_store) as conn: + query = f"SELECT * FROM {self.get_table_query_string()} LIMIT 5" + cursor = execute_snowflake_statement(conn, query) + + metadata = [ + { + "column_name": column.name, + "type_code": column.type_code, + "precision": column.precision, + "scale": column.scale, + "is_nullable": column.is_nullable, + "snowflake_type": None, + } + for column in cursor.description + ] + + if cursor.fetch_pandas_all().empty: + raise DataSourceNotFoundException( + "The following source:\n" + query + "\n ... is empty" + ) - result = execute_snowflake_statement(snowflake_conn, query).fetch_pandas_all() + for row in metadata: + if row["type_code"] == 0: + if row["scale"] == 0: + if row["precision"] <= 9: # max precision size to ensure INT32 + row["snowflake_type"] = "NUMBER32" + elif row["precision"] <= 18: # max precision size to ensure INT64 + row["snowflake_type"] = "NUMBER64" + else: + column = row["column_name"] + + with get_snowflake_conn(config.offline_store) as conn: + query = f'SELECT MAX("{column}") AS "{column}" FROM {self.get_table_query_string()}' + result = execute_snowflake_statement( + conn, query + ).fetch_pandas_all() + if ( + result.dtypes[column].name + in python_int_to_snowflake_type_map + ): + row["snowflake_type"] = python_int_to_snowflake_type_map[ + result.dtypes[column].name + ] + else: + raise NotImplementedError( + "Numbers larger than INT64 are not supported" + ) + else: + raise NotImplementedError( + "The following Snowflake Data Type is not supported: DECIMAL -- Convert to DOUBLE" + ) + elif row["type_code"] in [3, 5, 9, 10, 12]: + error = snowflake_unsupported_map[row["type_code"]] + raise NotImplementedError( + f"The following Snowflake Data Type is not supported: {error}" + ) + elif row["type_code"] in [1, 2, 4, 6, 7, 
8, 11, 13]: + row["snowflake_type"] = snowflake_type_code_map[row["type_code"]] + else: + raise NotImplementedError( + f"The following Snowflake Column is not supported: {row['column_name']} (type_code: {row['type_code']})" + ) - if not result.empty: - metadata = result.dtypes.apply(str) - return list(zip(metadata.index, metadata)) - else: - raise ValueError("The following source:\n" + query + "\n ... is empty") + return [ + (column["column_name"], column["snowflake_type"]) for column in metadata + ] + + +snowflake_type_code_map = { + 0: "NUMBER", + 1: "DOUBLE", + 2: "VARCHAR", + 4: "TIMESTAMP", + 6: "TIMESTAMP_LTZ", + 7: "TIMESTAMP_TZ", + 8: "TIMESTAMP_NTZ", + 11: "BINARY", + 13: "BOOLEAN", +} + +snowflake_unsupported_map = { + 3: "DATE -- Convert to TIMESTAMP", + 5: "VARIANT -- Try converting to VARCHAR", + 9: "OBJECT -- Try converting to VARCHAR", + 10: "ARRAY -- Try converting to VARCHAR", + 12: "TIME -- Try converting to VARCHAR", +} + +python_int_to_snowflake_type_map = { + "int64": "NUMBER64", + "int32": "NUMBER32", + "int16": "NUMBER32", + "int8": "NUMBER32", +} class SnowflakeOptions: @@ -319,7 +380,11 @@ class SavedDatasetSnowflakeStorage(SavedDatasetStorage): def __init__(self, table_ref: str): self.snowflake_options = SnowflakeOptions( - database=None, schema=None, table=table_ref, query=None, warehouse=None + database=None, + schema=None, + table=table_ref, + query=None, + warehouse=None, ) @staticmethod diff --git a/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/README.md b/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/README.md new file mode 100644 index 0000000000..3dea1917aa --- /dev/null +++ b/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/README.md @@ -0,0 +1,131 @@ +# Cassandra/Astra DB Online Store + +This contribution makes it possible to use [Apache Cassandra™](https://cassandra.apache.org) / +[Astra DB](https://astra.datastax.com/) as online store for Feast. 
+ +Once the database connection and the keyspace are configured, everything else +is handled as with any other online store: table creation, +read/write from/to table and table destruction. + +## Quick usage + +The following refers to the [Feast quickstart](https://docs.feast.dev/getting-started/quickstart) page. Only +Step 2 ("Create a feature repository") is slightly different, as it involves +a bit of specific configuration about the Astra DB / Cassandra cluster you +are going to use. + +It will be assumed that Feast has been installed in your system. + +### Creating the feature repository + +The easiest way to get started is to use the Feast CLI to initialize a new +feature store. Once Feast is installed, the command + +``` +feast init FEATURE_STORE_NAME -t cassandra +``` + +will interactively help you create the `feature_store.yaml` with the +required configuration details to access your Cassandra / Astra DB instance. + +Alternatively, you can run `feast init FEATURE_STORE_NAME`, as described +in the quickstart, and then manually edit the `online_store` key in +the `feature_store.yaml` file as detailed below. + +The following steps (setup of feature definitions, deployment of the store, +generation of training data, materialization, fetching of online/offline +features) proceed exactly as in the general Feast quickstart instructions. + +#### Cassandra setup + +The only required settings are `hosts` and `type`. The port number +is to be provided only if different than the default (9042), +and username/password only if the database requires authentication. + +```yaml +[...]
+online_store: + type: cassandra + hosts: + - 192.168.1.1 + - 192.168.1.2 + - 192.168.1.3 + keyspace: KeyspaceName + port: 9042 # optional + username: user # optional + password: secret # optional + protocol_version: 5 # optional + load_balancing: # optional + local_dc: 'datacenter1' # optional + load_balancing_policy: 'TokenAwarePolicy(DCAwareRoundRobinPolicy)' # optional +``` + +#### Astra DB setup: + +To point Feast to using an Astra DB instance as online store, an +[Astra DB token](https://awesome-astra.github.io/docs/pages/astra/create-token/#c-procedure) +with "Database Administrator" role is required: provide the Client ID and +Client Secret in the token as username and password. + +The +["secure connect bundle"](https://awesome-astra.github.io/docs/pages/astra/download-scb/#c-procedure) +for connecting to the database is also needed: +its full path must be given in the configuration below: + +```yaml +[...] +online_store: + type: cassandra + secure_bundle_path: /path/to/secure/bundle.zip + keyspace: KeyspaceName + username: Client_ID + password: Client_Secret + protocol_version: 4 # optional + load_balancing: # optional + local_dc: 'eu-central-1' # optional + load_balancing_policy: 'TokenAwarePolicy(DCAwareRoundRobinPolicy)' # optional +``` + +#### Protocol version and load-balancing settings + +Whether on Astra DB or Cassandra, there are some optional settings in the +store definition yaml: + +```yaml + [...] + protocol_version: 5 # optional + load_balancing: # optional + local_dc: 'datacenter1' # optional + load_balancing_policy: 'TokenAwarePolicy(DCAwareRoundRobinPolicy)' # optional +``` + +If you specify a protocol version (4 for `Astra DB` as of June 2022, 5 for `Cassandra 4.*`), +you avoid the drivers having to negotiate it on their own, thus speeding up initialization +time (and reducing the `INFO` messages being logged). 
See [this page](https://docs.datastax.com/en/developer/python-driver/3.25/api/cassandra/#cassandra.ProtocolVersion) for a listing +of protocol versions. + +You should provide the load-balancing properties as well (the reference datacenter +to use for the connection and the load-balancing policy to use). In a future version +of the driver, according to the warnings issued in the logs, this will become mandatory. +The former parameter is a region name for Astra DB instances (as can be verified on the Astra DB UI). +See the source code of the online store integration for the allowed values of +the latter parameter. + +### More info + +For a more detailed walkthrough, please see the +[Awesome Astra](https://awesome-astra.github.io/docs/pages/tools/integration/feast/) +page on the Feast integration. + +## Features + +The plugin leverages the architecture of Cassandra for optimal performance: + +- table partitioning tailored to data access pattern; +- prepared statements. + +#### Credits + +The author of this plugin acknowledges prior exploratory work by +[`hamzakpt`](https://github.com/hamzakpt) and Brian Mortimore, +on which this implementation is loosely based. diff --git a/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/__init__.py b/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/cassandra_online_store.py b/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/cassandra_online_store.py new file mode 100644 index 0000000000..ee0cb19fef --- /dev/null +++ b/sdk/python/feast/infra/online_stores/contrib/cassandra_online_store/cassandra_online_store.py @@ -0,0 +1,547 @@ +# +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Cassandra/Astra DB online store for Feast. +""" + +import logging +from datetime import datetime +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple + +from cassandra.auth import PlainTextAuthProvider +from cassandra.cluster import ( + EXEC_PROFILE_DEFAULT, + Cluster, + ExecutionProfile, + ResultSet, + Session, +) +from cassandra.policies import DCAwareRoundRobinPolicy, TokenAwarePolicy +from cassandra.query import PreparedStatement +from pydantic import StrictInt, StrictStr +from pydantic.typing import Literal + +from feast import Entity, FeatureView, RepoConfig +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.online_stores.online_store import OnlineStore +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import FeastConfigBaseModel +from feast.usage import log_exceptions_and_usage, tracing_span + +# Error messages +E_CASSANDRA_UNEXPECTED_CONFIGURATION_CLASS = ( + "Unexpected configuration object (not a CassandraOnlineStoreConfig instance)" +) +E_CASSANDRA_NOT_CONFIGURED = ( + "Inconsistent Cassandra configuration: provide exactly one between " + "'hosts' and 'secure_bundle_path' and a 'keyspace'" +) +E_CASSANDRA_MISCONFIGURED = ( + "Inconsistent Cassandra configuration: provide either 'hosts' or " + "'secure_bundle_path', not both" +) +E_CASSANDRA_INCONSISTENT_AUTH = ( + "Username and password for Cassandra must be provided either both or none" +) 
+E_CASSANDRA_UNKNOWN_LB_POLICY = ( + "Unknown/unsupported Load Balancing Policy name in Cassandra configuration" +) + +# CQL command templates (that is, before replacing schema names) +INSERT_CQL_4_TEMPLATE = ( + "INSERT INTO {fqtable} (feature_name," + " value, entity_key, event_ts) VALUES" + " (?, ?, ?, ?);" +) + +SELECT_CQL_TEMPLATE = "SELECT {columns} FROM {fqtable} WHERE entity_key = ?;" + +CREATE_TABLE_CQL_TEMPLATE = """ + CREATE TABLE IF NOT EXISTS {fqtable} ( + entity_key TEXT, + feature_name TEXT, + value BLOB, + event_ts TIMESTAMP, + created_ts TIMESTAMP, + PRIMARY KEY ((entity_key), feature_name) + ) WITH CLUSTERING ORDER BY (feature_name ASC); +""" + +DROP_TABLE_CQL_TEMPLATE = "DROP TABLE IF EXISTS {fqtable};" + +# op_name -> (cql template string, prepare boolean) +CQL_TEMPLATE_MAP = { + # Queries/DML, statements to be prepared + "insert4": (INSERT_CQL_4_TEMPLATE, True), + "select": (SELECT_CQL_TEMPLATE, True), + # DDL, do not prepare these + "drop": (DROP_TABLE_CQL_TEMPLATE, False), + "create": (CREATE_TABLE_CQL_TEMPLATE, False), +} + +# Logger +logger = logging.getLogger(__name__) + + +class CassandraInvalidConfig(Exception): + def __init__(self, msg: str): + super().__init__(msg) + + +class CassandraOnlineStoreConfig(FeastConfigBaseModel): + """ + Configuration for the Cassandra/Astra DB online store. + + Exactly one of `hosts` and `secure_bundle_path` must be provided; + depending on which one, the connection will be to a regular Cassandra + or an Astra DB instance (respectively). + + If connecting to Astra DB, authentication must be provided with username + and password being the Client ID and Client Secret of the database token. 
+ """ + + type: Literal["cassandra"] = "cassandra" + """Online store type selector.""" + + # settings for connection to Cassandra / Astra DB + + hosts: Optional[List[StrictStr]] = None + """List of host addresses to reach the cluster.""" + + secure_bundle_path: Optional[StrictStr] = None + """Path to the secure connect bundle (for Astra DB; replaces hosts).""" + + port: Optional[StrictInt] = None + """Port number for connecting to the cluster (optional).""" + + keyspace: StrictStr = "feast_keyspace" + """Target Cassandra keyspace where all tables will be.""" + + username: Optional[StrictStr] = None + """Username for DB auth, possibly Astra DB token Client ID.""" + + password: Optional[StrictStr] = None + """Password for DB auth, possibly Astra DB token Client Secret.""" + + protocol_version: Optional[StrictInt] = None + """Explicit specification of the CQL protocol version used.""" + + class CassandraLoadBalancingPolicy(FeastConfigBaseModel): + """ + Configuration block related to the Cluster's load-balancing policy. + """ + + load_balancing_policy: StrictStr + """ + A stringy description of the load balancing policy to instantiate + the cluster with. Supported values: + "DCAwareRoundRobinPolicy" + "TokenAwarePolicy(DCAwareRoundRobinPolicy)" + """ + + local_dc: StrictStr = "datacenter1" + """The local datacenter, usually necessary to create the policy.""" + + load_balancing: Optional[CassandraLoadBalancingPolicy] = None + """ + Details on the load-balancing policy: it will be + wrapped into an execution profile if present. + """ + + +class CassandraOnlineStore(OnlineStore): + """ + Cassandra/Astra DB online store implementation for Feast. + + Attributes: + _cluster: Cassandra cluster to connect to. + _session: (DataStax Cassandra drivers) session object + to issue commands. + _keyspace: Cassandra keyspace all tables live in. + _prepared_statements: cache of statements prepared by the driver. 
+ """ + + _cluster: Cluster = None + _session: Session = None + _keyspace: str = "feast_keyspace" + _prepared_statements: Dict[str, PreparedStatement] = {} + + def _get_session(self, config: RepoConfig): + """ + Establish the database connection, if not yet created, + and return it. + + Also perform basic config validation checks. + """ + + online_store_config = config.online_store + if not isinstance(online_store_config, CassandraOnlineStoreConfig): + raise CassandraInvalidConfig(E_CASSANDRA_UNEXPECTED_CONFIGURATION_CLASS) + + if self._session: + return self._session + if not self._session: + # configuration consistency checks + hosts = online_store_config.hosts + secure_bundle_path = online_store_config.secure_bundle_path + port = online_store_config.port or 9042 + keyspace = online_store_config.keyspace + username = online_store_config.username + password = online_store_config.password + protocol_version = online_store_config.protocol_version + + db_directions = hosts or secure_bundle_path + if not db_directions or not keyspace: + raise CassandraInvalidConfig(E_CASSANDRA_NOT_CONFIGURED) + if hosts and secure_bundle_path: + raise CassandraInvalidConfig(E_CASSANDRA_MISCONFIGURED) + if (username is None) ^ (password is None): + raise CassandraInvalidConfig(E_CASSANDRA_INCONSISTENT_AUTH) + + if username is not None: + auth_provider = PlainTextAuthProvider( + username=username, + password=password, + ) + else: + auth_provider = None + + # handling of load-balancing policy (optional) + if online_store_config.load_balancing: + # construct a proper execution profile embedding + # the configured LB policy + _lbp_name = online_store_config.load_balancing.load_balancing_policy + if _lbp_name == "DCAwareRoundRobinPolicy": + lb_policy = DCAwareRoundRobinPolicy( + local_dc=online_store_config.load_balancing.local_dc, + ) + elif _lbp_name == "TokenAwarePolicy(DCAwareRoundRobinPolicy)": + lb_policy = TokenAwarePolicy( + DCAwareRoundRobinPolicy( + 
 local_dc=online_store_config.load_balancing.local_dc, + ) + ) + else: + raise CassandraInvalidConfig(E_CASSANDRA_UNKNOWN_LB_POLICY) + + # wrap it up in a map of ex.profiles with a default + exe_profile = ExecutionProfile(load_balancing_policy=lb_policy) + execution_profiles = {EXEC_PROFILE_DEFAULT: exe_profile} + else: + execution_profiles = None + + # additional optional keyword args to Cluster + cluster_kwargs = { + k: v + for k, v in { + "protocol_version": protocol_version, + "execution_profiles": execution_profiles, + }.items() + if v is not None + } + + # creation of Cluster (Cassandra vs. Astra) + if hosts: + self._cluster = Cluster( + hosts, port=port, auth_provider=auth_provider, **cluster_kwargs + ) + else: + # we use 'secure_bundle_path' + self._cluster = Cluster( + cloud={"secure_connect_bundle": secure_bundle_path}, + auth_provider=auth_provider, + **cluster_kwargs, + ) + + # creation of Session + self._keyspace = keyspace + self._session = self._cluster.connect(self._keyspace) + + return self._session + + def __del__(self): + """ + One may be tempted to reclaim resources and do, here: + if self._session: + self._session.shutdown() + But *beware*, DON'T DO THIS. + Indeed this could destroy the session object before some internal + tasks run in other threads (this is handled internally in the + Cassandra driver). + You'd get a RuntimeError "cannot schedule new futures after shutdown". + """ + pass + + @log_exceptions_and_usage(online_store="cassandra") + def online_write_batch( + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], + ) -> None: + """ + Write a batch of features of several entities to the database. + + Args: + config: The RepoConfig for the current FeatureStore. + table: Feast FeatureView. + data: a list of quadruplets containing Feature data.
Each + quadruplet contains an Entity Key, a dict containing feature + values, an event timestamp for the row, and + the created timestamp for the row if it exists. + progress: Optional function to be called once every mini-batch of + rows is written to the online store. Can be used to + display progress. + """ + project = config.project + for entity_key, values, timestamp, created_ts in data: + entity_key_bin = serialize_entity_key( + entity_key, entity_key_serialization_version=2 + ).hex() + with tracing_span(name="remote_call"): + self._write_rows( + config, + project, + table, + entity_key_bin, + values.items(), + timestamp, + created_ts, + ) + if progress: + progress(1) + + @log_exceptions_and_usage(online_store="cassandra") + def online_read( + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + """ + Read feature values pertaining to the requested entities from + the online store. + + Args: + config: The RepoConfig for the current FeatureStore. + table: Feast FeatureView. + entity_keys: a list of entity keys that should be read + from the FeatureStore. 
+ """ + project = config.project + + result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] + + for entity_key in entity_keys: + entity_key_bin = serialize_entity_key( + entity_key, entity_key_serialization_version=2 + ).hex() + + with tracing_span(name="remote_call"): + feature_rows = self._read_rows_by_entity_key( + config, + project, + table, + entity_key_bin, + columns=["feature_name", "value", "event_ts"], + ) + + res = {} + res_ts = None + for feature_row in feature_rows: + if ( + requested_features is None + or feature_row.feature_name in requested_features + ): + val = ValueProto() + val.ParseFromString(feature_row.value) + res[feature_row.feature_name] = val + res_ts = feature_row.event_ts + if not res: + result.append((None, None)) + else: + result.append((res_ts, res)) + return result + + @log_exceptions_and_usage(online_store="cassandra") + def update( + self, + config: RepoConfig, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, + ): + """ + Update schema on DB, by creating and destroying tables accordingly. + + Args: + config: The RepoConfig for the current FeatureStore. + tables_to_delete: Tables to delete from the Online Store. + tables_to_keep: Tables to keep in the Online Store. + """ + project = config.project + + for table in tables_to_keep: + with tracing_span(name="remote_call"): + self._create_table(config, project, table) + for table in tables_to_delete: + with tracing_span(name="remote_call"): + self._drop_table(config, project, table) + + @log_exceptions_and_usage(online_store="cassandra") + def teardown( + self, + config: RepoConfig, + tables: Sequence[FeatureView], + entities: Sequence[Entity], + ): + """ + Delete tables from the database. + + Args: + config: The RepoConfig for the current FeatureStore. + tables: Tables to delete from the feature repo. 
+ """ + project = config.project + + for table in tables: + with tracing_span(name="remote_call"): + self._drop_table(config, project, table) + + @staticmethod + def _fq_table_name(keyspace: str, project: str, table: FeatureView) -> str: + """ + Generate a fully-qualified table name, + including quotes and keyspace. + """ + return f'"{keyspace}"."{project}_{table.name}"' + + def _write_rows( + self, + config: RepoConfig, + project: str, + table: FeatureView, + entity_key_bin: str, + features_vals: Iterable[Tuple[str, ValueProto]], + timestamp: datetime, + created_ts: Optional[datetime], + ): + """ + Handle the CQL (low-level) insertion of feature values to a table. + + Note: `created_ts` can be None: in that case we avoid explicitly + inserting it to prevent unnecessary tombstone creation on Cassandra. + Note: `created_ts` is being deprecated (July 2022) and the following + reflects this fact. + """ + session: Session = self._get_session(config) + keyspace: str = self._keyspace + fqtable = CassandraOnlineStore._fq_table_name(keyspace, project, table) + insert_cql = self._get_cql_statement(config, "insert4", fqtable=fqtable) + for feature_name, val in features_vals: + params: Sequence[object] = ( + feature_name, + val.SerializeToString(), + entity_key_bin, + timestamp, + ) + session.execute( + insert_cql, + params, + ) + + def _read_rows_by_entity_key( + self, + config: RepoConfig, + project: str, + table: FeatureView, + entity_key_bin: str, + columns: Optional[List[str]] = None, + ) -> ResultSet: + """ + Handle the CQL (low-level) reading of feature values from a table. 
+ """ + session: Session = self._get_session(config) + keyspace: str = self._keyspace + fqtable = CassandraOnlineStore._fq_table_name(keyspace, project, table) + projection_columns = "*" if columns is None else ", ".join(columns) + select_cql = self._get_cql_statement( + config, + "select", + fqtable=fqtable, + columns=projection_columns, + ) + return session.execute(select_cql, [entity_key_bin]) + + def _drop_table( + self, + config: RepoConfig, + project: str, + table: FeatureView, + ): + """Handle the CQL (low-level) deletion of a table.""" + session: Session = self._get_session(config) + keyspace: str = self._keyspace + fqtable = CassandraOnlineStore._fq_table_name(keyspace, project, table) + drop_cql = self._get_cql_statement(config, "drop", fqtable) + logger.info(f"Deleting table {fqtable}.") + session.execute(drop_cql) + + def _create_table(self, config: RepoConfig, project: str, table: FeatureView): + """Handle the CQL (low-level) creation of a table.""" + session: Session = self._get_session(config) + keyspace: str = self._keyspace + fqtable = CassandraOnlineStore._fq_table_name(keyspace, project, table) + create_cql = self._get_cql_statement(config, "create", fqtable) + logger.info(f"Creating table {fqtable}.") + session.execute(create_cql) + + def _get_cql_statement( + self, config: RepoConfig, op_name: str, fqtable: str, **kwargs + ): + """ + Resolve an 'op_name' (create, insert4, etc) into a CQL statement + ready to be bound to parameters when executing. + + If the statement is defined to be 'prepared', use an instance-specific + cache of prepared statements. + + This additional layer makes it easy to control whether to use prepared + statements and, if so, on which database operations. 
+ """ + session: Session = self._get_session(config) + template, prepare = CQL_TEMPLATE_MAP[op_name] + statement = template.format( + fqtable=fqtable, + **kwargs, + ) + if prepare: + # using the statement itself as key (no problem with that) + cache_key = statement + if cache_key not in self._prepared_statements: + logger.info(f"Preparing a {op_name} statement on {fqtable}.") + self._prepared_statements[cache_key] = session.prepare(statement) + return self._prepared_statements[cache_key] + else: + return statement diff --git a/sdk/python/feast/infra/online_stores/contrib/cassandra_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/cassandra_repo_configuration.py new file mode 100644 index 0000000000..a1d619646f --- /dev/null +++ b/sdk/python/feast/infra/online_stores/contrib/cassandra_repo_configuration.py @@ -0,0 +1,26 @@ +# +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, +) +from tests.integration.feature_repos.universal.online_store.cassandra import ( + CassandraOnlineStoreCreator, +) + +FULL_REPO_CONFIGS = [ + IntegrationTestRepoConfig(online_store_creator=CassandraOnlineStoreCreator), +] diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py new file mode 100644 index 0000000000..2a9f0d54cd --- /dev/null +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -0,0 +1,10 @@ +from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( + PostgreSQLDataSourceCreator, +) +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, +) + +FULL_REPO_CONFIGS = [ + IntegrationTestRepoConfig(online_store_creator=PostgreSQLDataSourceCreator), +] diff --git a/sdk/python/feast/infra/online_stores/datastore.py b/sdk/python/feast/infra/online_stores/datastore.py index eabf2ccefc..ed4e7612ba 100644 --- a/sdk/python/feast/infra/online_stores/datastore.py +++ b/sdk/python/feast/infra/online_stores/datastore.py @@ -35,11 +35,12 @@ from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.usage import log_exceptions_and_usage, tracing_span +from feast.usage import get_user_agent, log_exceptions_and_usage, tracing_span LOGGER = logging.getLogger(__name__) try: + from google.api_core import client_info as http_client_info from google.auth.exceptions import DefaultCredentialsError from google.cloud import datastore from google.cloud.datastore.client import Key @@ -49,6 +50,10 @@ raise FeastExtrasDependencyImportError("gcp", str(e)) +def get_http_client_info(): + return 
http_client_info.ClientInfo(user_agent=get_user_agent()) + + ProtoBatch = Sequence[ Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] ] @@ -75,8 +80,13 @@ class DatastoreOnlineStoreConfig(FeastConfigBaseModel): class DatastoreOnlineStore(OnlineStore): """ - OnlineStore is an object used for all interaction between Feast and the service used for offline storage of - features. + Google Cloud Datastore implementation of the online store interface. + + See https://github.com/feast-dev/feast/blob/master/docs/specs/online_store_format.md#google-datastore-online-store-format + for more details about the data model for this implementation. + + Attributes: + _client: Datastore connection. """ _client: Optional[datastore.Client] = None @@ -331,8 +341,7 @@ def _initialize_client( ) -> datastore.Client: try: client = datastore.Client( - project=project_id, - namespace=namespace, + project=project_id, namespace=namespace, client_info=get_http_client_info() ) return client except DefaultCredentialsError as e: diff --git a/sdk/python/feast/infra/online_stores/dynamodb.py b/sdk/python/feast/infra/online_stores/dynamodb.py index 257a1fd80d..525978e736 100644 --- a/sdk/python/feast/infra/online_stores/dynamodb.py +++ b/sdk/python/feast/infra/online_stores/dynamodb.py @@ -16,7 +16,7 @@ from datetime import datetime from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple -from pydantic import StrictStr +from pydantic import StrictBool, StrictStr from pydantic.typing import Literal, Union from feast import Entity, FeatureView, utils @@ -30,10 +30,11 @@ from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.usage import log_exceptions_and_usage, tracing_span +from feast.usage import get_user_agent, log_exceptions_and_usage, tracing_span try: import boto3 + from botocore.config import 
Config from botocore.exceptions import ClientError except ImportError as e: from feast.errors import FeastExtrasDependencyImportError @@ -62,10 +63,13 @@ class DynamoDBOnlineStoreConfig(FeastConfigBaseModel): table_name_template: StrictStr = "{project}.{table_name}" """DynamoDB table name template""" + consistent_reads: StrictBool = False + """Whether to read from Dynamodb by forcing consistent reads""" + class DynamoDBOnlineStore(OnlineStore): """ - Online feature store for AWS DynamoDB. + AWS DynamoDB implementation of the online store interface. Attributes: _dynamodb_client: Boto3 DynamoDB client. @@ -236,12 +240,12 @@ def online_read( batch_entity_ids = { table_instance.name: { "Keys": [{"entity_id": entity_id} for entity_id in batch], - "ConsistentRead": True, + "ConsistentRead": online_config.consistent_reads, } } with tracing_span(name="remote_call"): response = dynamodb_resource.batch_get_item( - RequestItems=batch_entity_ids + RequestItems=batch_entity_ids, ) response = response.get("Responses") table_responses = response.get(table_instance.name) @@ -330,7 +334,12 @@ def _write_batch_non_duplicates( def _initialize_dynamodb_client(region: str, endpoint_url: Optional[str] = None): - return boto3.client("dynamodb", region_name=region, endpoint_url=endpoint_url) + return boto3.client( + "dynamodb", + region_name=region, + endpoint_url=endpoint_url, + config=Config(user_agent=get_user_agent()), + ) def _initialize_dynamodb_resource(region: str, endpoint_url: Optional[str] = None): diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 04c6a065fb..fcc3376dce 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -27,8 +27,7 @@ class OnlineStore(ABC): """ - OnlineStore is an object used for all interaction between Feast and the service used for online storage of - features. 
+ The interface that Feast uses to interact with the storage system that handles online features. """ @abstractmethod @@ -42,21 +41,20 @@ def online_write_batch( progress: Optional[Callable[[int], Any]], ) -> None: """ - Write a batch of feature rows to the online store. This is a low level interface, not - expected to be used by the users directly. + Writes a batch of feature rows to the online store. - If a tz-naive timestamp is passed to this method, it should be assumed to be UTC by implementors. + If a tz-naive timestamp is passed to this method, it is assumed to be UTC. Args: - config: The RepoConfig for the current FeatureStore. - table: Feast FeatureView - data: a list of quadruplets containing Feature data. Each quadruplet contains an Entity Key, - a dict containing feature values, an event timestamp for the row, and - the created timestamp for the row if it exists. - progress: Optional function to be called once every mini-batch of rows is written to - the online store. Can be used to display progress. + config: The config for the current feature store. + table: Feature view to which these feature rows correspond. + data: A list of quadruplets containing feature data. Each quadruplet contains an entity + key, a dict containing feature values, an event timestamp for the row, and the created + timestamp for the row if it exists. + progress: Function to be called once a batch of rows is written to the online store, used + to show progress. """ - ... + pass @abstractmethod def online_read( @@ -67,20 +65,20 @@ def online_read( requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ - Read feature values given an Entity Key. This is a low level interface, not - expected to be used by the users directly. + Reads features values for the given entity keys. Args: - config: The RepoConfig for the current FeatureStore. 
- table: Feast FeatureView - entity_keys: a list of entity keys that should be read from the FeatureStore. - requested_features: (Optional) A subset of the features that should be read from the FeatureStore. + config: The config for the current feature store. + table: The feature view whose feature values should be read. + entity_keys: The list of entity keys for which feature values should be read. + requested_features: The list of features that should be read. + Returns: - Data is returned as a list, one item per entity key in the original order as the entity_keys argument. - Each item in the list is a tuple of event_ts for the row, and the feature data as a dict from feature names - to values. Values are returned as Value proto message. + A list of the same length as entity_keys. Each item in the list is a tuple where the first + item is the event timestamp for the row, and the second item is a dict mapping feature names + to values, which are returned in proto format. """ - ... + pass @abstractmethod def update( @@ -92,7 +90,21 @@ def update( entities_to_keep: Sequence[Entity], partial: bool, ): - ... + """ + Reconciles cloud resources with the specified set of Feast objects. + + Args: + config: The config for the current feature store. + tables_to_delete: Feature views whose corresponding infrastructure should be deleted. + tables_to_keep: Feature views whose corresponding infrastructure should not be deleted, and + may need to be updated. + entities_to_delete: Entities whose corresponding infrastructure should be deleted. + entities_to_keep: Entities whose corresponding infrastructure should not be deleted, and + may need to be updated. + partial: If true, tables_to_delete and tables_to_keep are not exhaustive lists, so + infrastructure corresponding to other feature views should be not be touched. 
+ """ + pass def plan( self, config: RepoConfig, desired_registry_proto: RegistryProto @@ -101,7 +113,7 @@ def plan( Returns the set of InfraObjects required to support the desired registry. Args: - config: The RepoConfig for the current FeatureStore. + config: The config for the current feature store. desired_registry_proto: The desired registry, in proto form. """ return [] @@ -113,4 +125,12 @@ def teardown( tables: Sequence[FeatureView], entities: Sequence[Entity], ): - ... + """ + Tears down all cloud resources for the specified set of Feast objects. + + Args: + config: The config for the current feature store. + tables: Feature views whose corresponding infrastructure should be deleted. + entities: Entities whose corresponding infrastructure should be deleted. + """ + pass diff --git a/sdk/python/feast/infra/online_stores/redis.py b/sdk/python/feast/infra/online_stores/redis.py index da458a3693..8af2097076 100644 --- a/sdk/python/feast/infra/online_stores/redis.py +++ b/sdk/python/feast/infra/online_stores/redis.py @@ -74,6 +74,16 @@ class RedisOnlineStoreConfig(FeastConfigBaseModel): class RedisOnlineStore(OnlineStore): + """ + Redis implementation of the online store interface. + + See https://github.com/feast-dev/feast/blob/master/docs/specs/online_store_format.md#redis-online-store-format + for more details about the data model for this implementation. + + Attributes: + _client: Redis connection. 
+ """ + _client: Optional[Union[Redis, RedisCluster]] = None def delete_entity_values(self, config: RepoConfig, join_keys: List[str]): diff --git a/sdk/python/feast/infra/online_stores/snowflake.py b/sdk/python/feast/infra/online_stores/snowflake.py index 73c68e4bc0..a52beb73f7 100644 --- a/sdk/python/feast/infra/online_stores/snowflake.py +++ b/sdk/python/feast/infra/online_stores/snowflake.py @@ -6,19 +6,23 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple import pandas as pd -import pytz -from pydantic import Field +from pydantic import Field, StrictStr from pydantic.schema import Literal from feast.entity import Entity from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key from feast.infra.online_stores.online_store import OnlineStore -from feast.infra.utils.snowflake_utils import get_snowflake_conn, write_pandas_binary +from feast.infra.utils.snowflake.snowflake_utils import ( + execute_snowflake_statement, + get_snowflake_conn, + write_pandas_binary, +) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.usage import log_exceptions_and_usage +from feast.utils import to_naive_utc class SnowflakeOnlineStoreConfig(FeastConfigBaseModel): @@ -47,7 +51,10 @@ class SnowflakeOnlineStoreConfig(FeastConfigBaseModel): warehouse: Optional[str] = None """ Snowflake warehouse name """ - database: Optional[str] = None + authenticator: Optional[str] = None + """ Snowflake authenticator name """ + + database: StrictStr """ Snowflake database name """ schema_: Optional[str] = Field("PUBLIC", alias="schema") @@ -84,24 +91,15 @@ def online_write_batch( index=range(0, len(values)), ) - timestamp = _to_naive_utc(timestamp) + timestamp = to_naive_utc(timestamp) if created_ts is not None: - created_ts = _to_naive_utc(created_ts) + created_ts 
= to_naive_utc(created_ts) - entity_key_serialization_version = ( - config.entity_key_serialization_version - if config.entity_key_serialization_version - else 2 - ) for j, (feature_name, val) in enumerate(values.items()): df.loc[j, "entity_feature_key"] = serialize_entity_key( - entity_key, - entity_key_serialization_version, + entity_key, 2 ) + bytes(feature_name, encoding="utf-8") - df.loc[j, "entity_key"] = serialize_entity_key( - entity_key, - entity_key_serialization_version, - ) + df.loc[j, "entity_key"] = serialize_entity_key(entity_key, 2) df.loc[j, "feature_name"] = feature_name df.loc[j, "value"] = val.SerializeToString() df.loc[j, "event_ts"] = timestamp @@ -113,13 +111,20 @@ def online_write_batch( agg_df = pd.concat(dfs) # This combines both the data upload plus the overwrite in the same transaction + table_path = ( + f'"{config.online_store.database}"."{config.online_store.schema_}"' + ) with get_snowflake_conn(config.online_store, autocommit=False) as conn: write_pandas_binary( - conn, agg_df, f"[online-transient] {config.project}_{table.name}" + conn, + agg_df, + table_name=f"[online-transient] {config.project}_{table.name}", + database=f"{config.online_store.database}", + schema=f"{config.online_store.schema_}", ) # special function for writing binary to snowflake query = f""" - INSERT OVERWRITE INTO "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" + INSERT OVERWRITE INTO {table_path}."[online-transient] {config.project}_{table.name}" SELECT "entity_feature_key", "entity_key", @@ -132,12 +137,11 @@ def online_write_batch( *, ROW_NUMBER() OVER(PARTITION BY "entity_key","feature_name" ORDER BY "event_ts" DESC, "created_ts" DESC) AS "_feast_row" FROM - "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}") + {table_path}."[online-transient] {config.project}_{table.name}") WHERE "_feast_row" = 1; """ - - 
conn.cursor().execute(query) + execute_snowflake_statement(conn, query) if progress: progress(len(data)) @@ -156,18 +160,12 @@ def online_read( result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] - entity_key_serialization_version = ( - config.entity_key_serialization_version - if config.entity_key_serialization_version - else 2 - ) - entity_fetch_str = ",".join( [ ( "TO_BINARY(" + hexlify( - serialize_entity_key(combo[0], entity_key_serialization_version) + serialize_entity_key(combo[0], 2) + bytes(combo[1], encoding="utf-8") ).__str__()[1:] + ")" @@ -176,28 +174,20 @@ def online_read( ] ) + table_path = f'"{config.online_store.database}"."{config.online_store.schema_}"' with get_snowflake_conn(config.online_store) as conn: - - df = ( - conn.cursor() - .execute( - f""" + query = f""" SELECT "entity_key", "feature_name", "value", "event_ts" FROM - "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" + {table_path}."[online-transient] {config.project}_{table.name}" WHERE "entity_feature_key" IN ({entity_fetch_str}) - """, - ) - .fetch_pandas_all() - ) + """ + df = execute_snowflake_statement(conn, query).fetch_pandas_all() for entity_key in entity_keys: - entity_key_bin = serialize_entity_key( - entity_key, - entity_key_serialization_version, - ) + entity_key_bin = serialize_entity_key(entity_key, 2) res = {} res_ts = None for index, row in df[df["entity_key"] == entity_key_bin].iterrows(): @@ -224,26 +214,24 @@ def update( ): assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + table_path = f'"{config.online_store.database}"."{config.online_store.schema_}"' with get_snowflake_conn(config.online_store) as conn: - for table in tables_to_keep: - - conn.cursor().execute( - f"""CREATE TRANSIENT TABLE IF NOT EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" ( + query = f""" + CREATE TRANSIENT 
TABLE IF NOT EXISTS {table_path}."[online-transient] {config.project}_{table.name}" ( "entity_feature_key" BINARY, "entity_key" BINARY, "feature_name" VARCHAR, "value" BINARY, "event_ts" TIMESTAMP, "created_ts" TIMESTAMP - )""" - ) + ) + """ + execute_snowflake_statement(conn, query) for table in tables_to_delete: - - conn.cursor().execute( - f'DROP TABLE IF EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}"' - ) + query = f'DROP TABLE IF EXISTS {table_path}."[online-transient] {config.project}_{table.name}"' + execute_snowflake_statement(conn, query) def teardown( self, @@ -253,15 +241,8 @@ def teardown( ): assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + table_path = f'"{config.online_store.database}"."{config.online_store.schema_}"' with get_snowflake_conn(config.online_store) as conn: - for table in tables: - query = f'DROP TABLE IF EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}"' - conn.cursor().execute(query) - - -def _to_naive_utc(ts: datetime): - if ts.tzinfo is None: - return ts - else: - return ts.astimezone(pytz.utc).replace(tzinfo=None) + query = f'DROP TABLE IF EXISTS {table_path}."[online-transient] {config.project}_{table.name}"' + execute_snowflake_statement(conn, query) diff --git a/sdk/python/feast/infra/online_stores/sqlite.py b/sdk/python/feast/infra/online_stores/sqlite.py index a880cef050..6949b2bf24 100644 --- a/sdk/python/feast/infra/online_stores/sqlite.py +++ b/sdk/python/feast/infra/online_stores/sqlite.py @@ -50,8 +50,7 @@ class SqliteOnlineStoreConfig(FeastConfigBaseModel): class SqliteOnlineStore(OnlineStore): """ - OnlineStore is an object used for all interaction between Feast and the service used for offline storage of - features. + SQLite implementation of the online store interface. Not recommended for production usage. Attributes: _conn: SQLite connection. 
diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 0b09f5df43..bb5cd38a83 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -11,17 +11,18 @@ from feast.feature_logging import FeatureServiceLoggingSource from feast.feature_service import FeatureService from feast.feature_view import FeatureView -from feast.infra.materialization import BatchMaterializationEngine, MaterializationTask from feast.infra.materialization.batch_materialization_engine import ( + BatchMaterializationEngine, MaterializationJobStatus, + MaterializationTask, ) from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config from feast.infra.online_stores.helpers import get_online_store_from_config from feast.infra.provider import Provider +from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.registry import BaseRegistry from feast.repo_config import BATCH_ENGINE_CLASS_FOR_TYPE, RepoConfig from feast.saved_dataset import SavedDataset from feast.stream_feature_view import StreamFeatureView @@ -37,7 +38,7 @@ class PassthroughProvider(Provider): """ - The Passthrough provider delegates all operations to the underlying online and offline stores. + The passthrough provider delegates all operations to the underlying online and offline stores. 
""" def __init__(self, config: RepoConfig): diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 086c9ec6b3..7d3c37e4c2 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -1,4 +1,4 @@ -import abc +from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union @@ -13,10 +13,10 @@ from feast.importer import import_class from feast.infra.infra_object import Infra from feast.infra.offline_stores.offline_store import RetrievalJob +from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDataset @@ -24,15 +24,22 @@ "gcp": "feast.infra.gcp.GcpProvider", "aws": "feast.infra.aws.AwsProvider", "local": "feast.infra.local.LocalProvider", + "azure": "feast.infra.contrib.azure_provider.AzureProvider", } -class Provider(abc.ABC): - @abc.abstractmethod +class Provider(ABC): + """ + A provider defines an implementation of a feature store object. It orchestrates the various + components of a feature store, such as the offline store, online store, and materialization + engine. It is configured through a RepoConfig object. + """ + + @abstractmethod def __init__(self, config: RepoConfig): - ... + pass - @abc.abstractmethod + @abstractmethod def update_infra( self, project: str, @@ -43,22 +50,20 @@ def update_infra( partial: bool, ): """ - Reconcile cloud resources with the objects declared in the feature repo. + Reconciles cloud resources with the specified set of Feast objects. 
Args: - project: Project to which tables belong - tables_to_delete: Tables that were deleted from the feature repo, so provider needs to - clean up the corresponding cloud resources. - tables_to_keep: Tables that are still in the feature repo. Depending on implementation, - provider may or may not need to update the corresponding resources. - entities_to_delete: Entities that were deleted from the feature repo, so provider needs to - clean up the corresponding cloud resources. - entities_to_keep: Entities that are still in the feature repo. Depending on implementation, - provider may or may not need to update the corresponding resources. - partial: if true, then tables_to_delete and tables_to_keep are *not* exhaustive lists. - There may be other tables that are not touched by this update. + project: Feast project to which the objects belong. + tables_to_delete: Feature views whose corresponding infrastructure should be deleted. + tables_to_keep: Feature views whose corresponding infrastructure should not be deleted, and + may need to be updated. + entities_to_delete: Entities whose corresponding infrastructure should be deleted. + entities_to_keep: Entities whose corresponding infrastructure should not be deleted, and + may need to be updated. + partial: If true, tables_to_delete and tables_to_keep are not exhaustive lists, so + infrastructure corresponding to other feature views should be not be touched. """ - ... + pass def plan_infra( self, config: RepoConfig, desired_registry_proto: RegistryProto @@ -72,7 +77,7 @@ def plan_infra( """ return Infra() - @abc.abstractmethod + @abstractmethod def teardown_infra( self, project: str, @@ -80,16 +85,16 @@ def teardown_infra( entities: Sequence[Entity], ): """ - Tear down all cloud resources for a repo. + Tears down all cloud resources for the specified set of Feast objects. Args: - project: Feast project to which tables belong - tables: Tables that are declared in the feature repo. 
- entities: Entities that are declared in the feature repo. + project: Feast project to which the objects belong. + tables: Feature views whose corresponding infrastructure should be deleted. + entities: Entities whose corresponding infrastructure should be deleted. """ - ... + pass - @abc.abstractmethod + @abstractmethod def online_write_batch( self, config: RepoConfig, @@ -100,21 +105,20 @@ def online_write_batch( progress: Optional[Callable[[int], Any]], ) -> None: """ - Write a batch of feature rows to the online store. This is a low level interface, not - expected to be used by the users directly. + Writes a batch of feature rows to the online store. If a tz-naive timestamp is passed to this method, it is assumed to be UTC. Args: - config: The RepoConfig for the current FeatureStore. - table: Feast FeatureView - data: a list of quadruplets containing Feature data. Each quadruplet contains an Entity Key, - a dict containing feature values, an event timestamp for the row, and - the created timestamp for the row if it exists. - progress: Optional function to be called once every mini-batch of rows is written to - the online store. Can be used to display progress. + config: The config for the current feature store. + table: Feature view to which these feature rows correspond. + data: A list of quadruplets containing feature data. Each quadruplet contains an entity + key, a dict containing feature values, an event timestamp for the row, and the created + timestamp for the row if it exists. + progress: Function to be called once a batch of rows is written to the online store, used + to show progress. """ - ... + pass def ingest_df( self, @@ -123,7 +127,12 @@ def ingest_df( df: pd.DataFrame, ): """ - Ingests a DataFrame directly into the online store + Persists a dataframe to the online store. + + Args: + feature_view: The feature view to which the dataframe corresponds. + entities: The entities that are referenced by the dataframe. 
+ df: The dataframe to be persisted. """ pass @@ -133,11 +142,15 @@ def ingest_df_to_offline_store( df: pyarrow.Table, ): """ - Ingests a DataFrame directly into the offline store + Persists a dataframe to the offline store. + + Args: + feature_view: The feature view to which the dataframe corresponds. + df: The dataframe to be persisted. """ pass - @abc.abstractmethod + @abstractmethod def materialize_single_feature_view( self, config: RepoConfig, @@ -148,9 +161,21 @@ def materialize_single_feature_view( project: str, tqdm_builder: Callable[[int], tqdm], ) -> None: + """ + Writes latest feature values in the specified time range to the online store. + + Args: + config: The config for the current feature store. + feature_view: The feature view to materialize. + start_date: The start of the time range. + end_date: The end of the time range. + registry: The registry for the current feature store. + project: Feast project to which the objects belong. + tqdm_builder: A function to monitor the progress of materialization. + """ pass - @abc.abstractmethod + @abstractmethod def get_historical_features( self, config: RepoConfig, @@ -161,9 +186,28 @@ def get_historical_features( project: str, full_feature_names: bool, ) -> RetrievalJob: + """ + Retrieves the point-in-time correct historical feature values for the specified entity rows. + + Args: + config: The config for the current feature store. + feature_views: A list containing all feature views that are referenced in the entity rows. + feature_refs: The features to be retrieved. + entity_df: A collection of rows containing all entity columns on which features need to be joined, + as well as the timestamp column used for point-in-time joins. Either a pandas dataframe can be + provided or a SQL query. + registry: The registry for the current feature store. + project: Feast project to which the feature views belong. 
+ full_feature_names: If True, feature names will be prefixed with the corresponding feature view name, + changing them from the format "feature" to "feature_view__feature" (e.g. "daily_transactions" + changes to "customer_fv__daily_transactions"). + + Returns: + A RetrievalJob that can be executed to get the features. + """ pass - @abc.abstractmethod + @abstractmethod def online_read( self, config: RepoConfig, @@ -172,32 +216,38 @@ def online_read( requested_features: List[str] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ - Read feature values given an Entity Key. This is a low level interface, not - expected to be used by the users directly. + Reads features values for the given entity keys. + + Args: + config: The config for the current feature store. + table: The feature view whose feature values should be read. + entity_keys: The list of entity keys for which feature values should be read. + requested_features: The list of features that should be read. Returns: - Data is returned as a list, one item per entity key. Each item in the list is a tuple - of event_ts for the row, and the feature data as a dict from feature names to values. - Values are returned as Value proto message. + A list of the same length as entity_keys. Each item in the list is a tuple where the first + item is the event timestamp for the row, and the second item is a dict mapping feature names + to values, which are returned in proto format. """ - ... + pass - @abc.abstractmethod + @abstractmethod def retrieve_saved_dataset( self, config: RepoConfig, dataset: SavedDataset ) -> RetrievalJob: """ - Read saved dataset from offline store. - All parameters for retrieval (like path, datetime boundaries, column names for both keys and features, etc) - are determined from SavedDataset object. + Reads a saved dataset. - Returns: - RetrievalJob object, which is lazy wrapper for actual query performed under the hood. 
+ Args: + config: The config for the current feature store. + dataset: A SavedDataset object containing all parameters necessary for retrieving the dataset. + Returns: + A RetrievalJob that can be executed to get the saved dataset. """ - ... + pass - @abc.abstractmethod + @abstractmethod def write_feature_service_logs( self, feature_service: FeatureService, @@ -206,16 +256,20 @@ def write_feature_service_logs( registry: BaseRegistry, ): """ - Write features and entities logged by a feature server to an offline store. + Writes features and entities logged by a feature server to the offline store. - Schema of logs table is being inferred from the provided feature service. - Only feature services with configured logging are accepted. + The schema of the logs table is inferred from the specified feature service. Only feature + services with configured logging are accepted. - Logs dataset can be passed as Arrow Table or path to parquet directory. + Args: + feature_service: The feature service to be logged. + logs: The logs, either as an arrow table or as a path to a parquet directory. + config: The config for the current feature store. + registry: The registry for the current feature store. """ - ... + pass - @abc.abstractmethod + @abstractmethod def retrieve_feature_service_logs( self, feature_service: FeatureService, @@ -225,21 +279,26 @@ def retrieve_feature_service_logs( registry: BaseRegistry, ) -> RetrievalJob: """ - Read logged features from an offline store for a given time window [from, to). - Target table is determined based on logging configuration from the feature service. + Reads logged features for the specified time window. - Returns: - RetrievalJob object, which wraps the query to the offline store. + Args: + feature_service: The feature service whose logs should be retrieved. + start_date: The start of the window. + end_date: The end of the window. + config: The config for the current feature store. 
+ registry: The registry for the current feature store. + Returns: + A RetrievalJob that can be executed to get the feature service logs. """ - ... + pass def get_feature_server_endpoint(self) -> Optional[str]: """Returns endpoint for the feature server, if it exists.""" return None -def get_provider(config: RepoConfig, repo_path: Path) -> Provider: +def get_provider(config: RepoConfig) -> Provider: if "." not in config.provider: if config.provider not in PROVIDERS_CLASS_FOR_TYPE: raise errors.FeastProviderNotImplementedError(config.provider) diff --git a/sdk/python/feast/infra/registry/__init__.py b/sdk/python/feast/infra/registry/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/registry/base_registry.py b/sdk/python/feast/infra/registry/base_registry.py new file mode 100644 index 0000000000..5edfae3472 --- /dev/null +++ b/sdk/python/feast/infra/registry/base_registry.py @@ -0,0 +1,647 @@ +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +from abc import ABC, abstractmethod +from collections import defaultdict +from datetime import datetime +from typing import Any, Dict, List, Optional + +from google.protobuf.json_format import MessageToJson +from proto import Message + +from feast.base_feature_view import BaseFeatureView +from feast.data_source import DataSource +from feast.entity import Entity +from feast.feature_service import FeatureService +from feast.feature_view import FeatureView +from feast.infra.infra_object import Infra +from feast.on_demand_feature_view import OnDemandFeatureView +from feast.project_metadata import ProjectMetadata +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto +from feast.request_feature_view import RequestFeatureView +from feast.saved_dataset import SavedDataset, ValidationReference +from feast.stream_feature_view import StreamFeatureView + + +class BaseRegistry(ABC): + """ + The interface that Feast uses to apply, list, retrieve, and delete Feast objects (e.g. entities, + feature views, and data sources). + """ + + # Entity operations + @abstractmethod + def apply_entity(self, entity: Entity, project: str, commit: bool = True): + """ + Registers a single entity with Feast + + Args: + entity: Entity that will be registered + project: Feast project that this entity belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def delete_entity(self, name: str, project: str, commit: bool = True): + """ + Deletes an entity or raises an exception if not found. + + Args: + name: Name of entity + project: Feast project that this entity belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def get_entity(self, name: str, project: str, allow_cache: bool = False) -> Entity: + """ + Retrieves an entity. 
+ + Args: + name: Name of entity + project: Feast project that this entity belongs to + allow_cache: Whether to allow returning this entity from a cached registry + + Returns: + Returns either the specified entity, or raises an exception if + none is found + """ + + @abstractmethod + def list_entities(self, project: str, allow_cache: bool = False) -> List[Entity]: + """ + Retrieve a list of entities from the registry + + Args: + allow_cache: Whether to allow returning entities from a cached registry + project: Filter entities based on project name + + Returns: + List of entities + """ + + # Data source operations + @abstractmethod + def apply_data_source( + self, data_source: DataSource, project: str, commit: bool = True + ): + """ + Registers a single data source with Feast + + Args: + data_source: A data source that will be registered + project: Feast project that this data source belongs to + commit: Whether to immediately commit to the registry + """ + + @abstractmethod + def delete_data_source(self, name: str, project: str, commit: bool = True): + """ + Deletes a data source or raises an exception if not found. + + Args: + name: Name of data source + project: Feast project that this data source belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def get_data_source( + self, name: str, project: str, allow_cache: bool = False + ) -> DataSource: + """ + Retrieves a data source. 
+ + Args: + name: Name of data source + project: Feast project that this data source belongs to + allow_cache: Whether to allow returning this data source from a cached registry + + Returns: + Returns either the specified data source, or raises an exception if none is found + """ + + @abstractmethod + def list_data_sources( + self, project: str, allow_cache: bool = False + ) -> List[DataSource]: + """ + Retrieve a list of data sources from the registry + + Args: + project: Filter data source based on project name + allow_cache: Whether to allow returning data sources from a cached registry + + Returns: + List of data sources + """ + + # Feature service operations + @abstractmethod + def apply_feature_service( + self, feature_service: FeatureService, project: str, commit: bool = True + ): + """ + Registers a single feature service with Feast + + Args: + feature_service: A feature service that will be registered + project: Feast project that this entity belongs to + """ + + @abstractmethod + def delete_feature_service(self, name: str, project: str, commit: bool = True): + """ + Deletes a feature service or raises an exception if not found. + + Args: + name: Name of feature service + project: Feast project that this feature service belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def get_feature_service( + self, name: str, project: str, allow_cache: bool = False + ) -> FeatureService: + """ + Retrieves a feature service. 
+ + Args: + name: Name of feature service + project: Feast project that this feature service belongs to + allow_cache: Whether to allow returning this feature service from a cached registry + + Returns: + Returns either the specified feature service, or raises an exception if + none is found + """ + + @abstractmethod + def list_feature_services( + self, project: str, allow_cache: bool = False + ) -> List[FeatureService]: + """ + Retrieve a list of feature services from the registry + + Args: + allow_cache: Whether to allow returning entities from a cached registry + project: Filter entities based on project name + + Returns: + List of feature services + """ + + # Feature view operations + @abstractmethod + def apply_feature_view( + self, feature_view: BaseFeatureView, project: str, commit: bool = True + ): + """ + Registers a single feature view with Feast + + Args: + feature_view: Feature view that will be registered + project: Feast project that this feature view belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def delete_feature_view(self, name: str, project: str, commit: bool = True): + """ + Deletes a feature view or raises an exception if not found. + + Args: + name: Name of feature view + project: Feast project that this feature view belongs to + commit: Whether the change should be persisted immediately + """ + + # stream feature view operations + @abstractmethod + def get_stream_feature_view( + self, name: str, project: str, allow_cache: bool = False + ): + """ + Retrieves a stream feature view. 
+ + Args: + name: Name of stream feature view + project: Feast project that this feature view belongs to + allow_cache: Allow returning feature view from the cached registry + + Returns: + Returns either the specified feature view, or raises an exception if + none is found + """ + + @abstractmethod + def list_stream_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[StreamFeatureView]: + """ + Retrieve a list of stream feature views from the registry + + Args: + project: Filter stream feature views based on project name + allow_cache: Whether to allow returning stream feature views from a cached registry + + Returns: + List of stream feature views + """ + + # on demand feature view operations + @abstractmethod + def get_on_demand_feature_view( + self, name: str, project: str, allow_cache: bool = False + ) -> OnDemandFeatureView: + """ + Retrieves an on demand feature view. + + Args: + name: Name of on demand feature view + project: Feast project that this on demand feature view belongs to + allow_cache: Whether to allow returning this on demand feature view from a cached registry + + Returns: + Returns either the specified on demand feature view, or raises an exception if + none is found + """ + + @abstractmethod + def list_on_demand_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[OnDemandFeatureView]: + """ + Retrieve a list of on demand feature views from the registry + + Args: + project: Filter on demand feature views based on project name + allow_cache: Whether to allow returning on demand feature views from a cached registry + + Returns: + List of on demand feature views + """ + + # regular feature view operations + @abstractmethod + def get_feature_view( + self, name: str, project: str, allow_cache: bool = False + ) -> FeatureView: + """ + Retrieves a feature view. 
+ + Args: + name: Name of feature view + project: Feast project that this feature view belongs to + allow_cache: Allow returning feature view from the cached registry + + Returns: + Returns either the specified feature view, or raises an exception if + none is found + """ + + @abstractmethod + def list_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[FeatureView]: + """ + Retrieve a list of feature views from the registry + + Args: + allow_cache: Allow returning feature views from the cached registry + project: Filter feature views based on project name + + Returns: + List of feature views + """ + + # request feature view operations + @abstractmethod + def get_request_feature_view(self, name: str, project: str) -> RequestFeatureView: + """ + Retrieves a request feature view. + + Args: + name: Name of request feature view + project: Feast project that this feature view belongs to + allow_cache: Allow returning feature view from the cached registry + + Returns: + Returns either the specified feature view, or raises an exception if + none is found + """ + + @abstractmethod + def list_request_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[RequestFeatureView]: + """ + Retrieve a list of request feature views from the registry + + Args: + allow_cache: Allow returning feature views from the cached registry + project: Filter feature views based on project name + + Returns: + List of request feature views + """ + + @abstractmethod + def apply_materialization( + self, + feature_view: FeatureView, + project: str, + start_date: datetime, + end_date: datetime, + commit: bool = True, + ): + """ + Updates materialization intervals tracked for a single feature view in Feast + + Args: + feature_view: Feature view that will be updated with an additional materialization interval tracked + project: Feast project that this feature view belongs to + start_date (datetime): Start date of the materialization interval to track + end_date 
(datetime): End date of the materialization interval to track + commit: Whether the change should be persisted immediately + """ + + # Saved dataset operations + @abstractmethod + def apply_saved_dataset( + self, + saved_dataset: SavedDataset, + project: str, + commit: bool = True, + ): + """ + Stores a saved dataset metadata with Feast + + Args: + saved_dataset: SavedDataset that will be added / updated to registry + project: Feast project that this dataset belongs to + commit: Whether the change should be persisted immediately + """ + + @abstractmethod + def get_saved_dataset( + self, name: str, project: str, allow_cache: bool = False + ) -> SavedDataset: + """ + Retrieves a saved dataset. + + Args: + name: Name of dataset + project: Feast project that this dataset belongs to + allow_cache: Whether to allow returning this dataset from a cached registry + + Returns: + Returns either the specified SavedDataset, or raises an exception if + none is found + """ + + def delete_saved_dataset(self, name: str, project: str, allow_cache: bool = False): + """ + Delete a saved dataset. 
+
+        Args:
+            name: Name of dataset
+            project: Feast project that this dataset belongs to
+            allow_cache: Whether to allow returning this dataset from a cached registry
+
+        Raises:
+            Raises an exception if the saved dataset
+            is not found.
+        """
+
+    @abstractmethod
+    def list_saved_datasets(
+        self, project: str, allow_cache: bool = False
+    ) -> List[SavedDataset]:
+        """
+        Retrieves a list of all saved datasets in specified project
+
+        Args:
+            project: Feast project
+            allow_cache: Whether to allow returning this dataset from a cached registry
+
+        Returns:
+            Returns the list of SavedDatasets
+        """
+
+    # Validation reference operations
+    @abstractmethod
+    def apply_validation_reference(
+        self,
+        validation_reference: ValidationReference,
+        project: str,
+        commit: bool = True,
+    ):
+        """
+        Persist a validation reference
+
+        Args:
+            validation_reference: ValidationReference that will be added / updated to registry
+            project: Feast project that this dataset belongs to
+            commit: Whether the change should be persisted immediately
+        """
+
+    @abstractmethod
+    def delete_validation_reference(self, name: str, project: str, commit: bool = True):
+        """
+        Deletes a validation reference or raises an exception if not found.
+
+        Args:
+            name: Name of validation reference
+            project: Feast project that this object belongs to
+            commit: Whether the change should be persisted immediately
+        """
+
+    @abstractmethod
+    def get_validation_reference(
+        self, name: str, project: str, allow_cache: bool = False
+    ) -> ValidationReference:
+        """
+        Retrieves a validation reference.
+
+        Args:
+            name: Name of dataset
+            project: Feast project that this dataset belongs to
+            allow_cache: Whether to allow returning this dataset from a cached registry
+
+        Returns:
+            Returns either the specified ValidationReference, or raises an exception if
+            none is found
+        """
+
+    # TODO: Needs to be implemented.
+    def list_validation_references(
+        self, project: str, allow_cache: bool = False
+    ) -> List[ValidationReference]:
+
+        """
+        Retrieve a list of validation references from the registry
+
+        Args:
+            allow_cache: Allow returning validation references from the cached registry
+            project: Filter validation references based on project name
+
+        Returns:
+            List of validation references
+        """
+
+    def list_project_metadata(
+        self, project: str, allow_cache: bool = False
+    ) -> List[ProjectMetadata]:
+        """
+        Retrieves project metadata
+
+        Args:
+            project: Filter metadata based on project name
+            allow_cache: Allow returning project metadata from the cached registry
+
+        Returns:
+            List of project metadata
+        """
+
+    @abstractmethod
+    def update_infra(self, infra: Infra, project: str, commit: bool = True):
+        """
+        Updates the stored Infra object.
+
+        Args:
+            infra: The new Infra object to be stored.
+            project: Feast project that the Infra object refers to
+            commit: Whether the change should be persisted immediately
+        """
+
+    @abstractmethod
+    def get_infra(self, project: str, allow_cache: bool = False) -> Infra:
+        """
+        Retrieves the stored Infra object.
+
+        Args:
+            project: Feast project that the Infra object refers to
+            allow_cache: Whether to allow returning this entity from a cached registry
+
+        Returns:
+            The stored Infra object.
+        """
+
+    @abstractmethod
+    def apply_user_metadata(
+        self,
+        project: str,
+        feature_view: BaseFeatureView,
+        metadata_bytes: Optional[bytes],
+    ):
+        ...
+
+    @abstractmethod
+    def get_user_metadata(
+        self, project: str, feature_view: BaseFeatureView
+    ) -> Optional[bytes]:
+        ...
+
+    @abstractmethod
+    def proto(self) -> RegistryProto:
+        """
+        Retrieves a proto version of the registry.
+
+        Returns:
+            The registry proto object.
+ """ + + @abstractmethod + def commit(self): + """Commits the state of the registry cache to the remote registry store.""" + + @abstractmethod + def refresh(self, project: Optional[str]): + """Refreshes the state of the registry cache by fetching the registry state from the remote registry store.""" + + @staticmethod + def _message_to_sorted_dict(message: Message) -> Dict[str, Any]: + return json.loads(MessageToJson(message, sort_keys=True)) + + def to_dict(self, project: str) -> Dict[str, List[Any]]: + """Returns a dictionary representation of the registry contents for the specified project. + + For each list in the dictionary, the elements are sorted by name, so this + method can be used to compare two registries. + + Args: + project: Feast project to convert to a dict + """ + registry_dict: Dict[str, Any] = defaultdict(list) + registry_dict["project"] = project + for project_metadata in sorted(self.list_project_metadata(project=project)): + registry_dict["projectMetadata"].append( + self._message_to_sorted_dict(project_metadata.to_proto()) + ) + for data_source in sorted( + self.list_data_sources(project=project), key=lambda ds: ds.name + ): + registry_dict["dataSources"].append( + self._message_to_sorted_dict(data_source.to_proto()) + ) + for entity in sorted( + self.list_entities(project=project), key=lambda entity: entity.name + ): + registry_dict["entities"].append( + self._message_to_sorted_dict(entity.to_proto()) + ) + for feature_view in sorted( + self.list_feature_views(project=project), + key=lambda feature_view: feature_view.name, + ): + registry_dict["featureViews"].append( + self._message_to_sorted_dict(feature_view.to_proto()) + ) + for feature_service in sorted( + self.list_feature_services(project=project), + key=lambda feature_service: feature_service.name, + ): + registry_dict["featureServices"].append( + self._message_to_sorted_dict(feature_service.to_proto()) + ) + for on_demand_feature_view in sorted( + 
self.list_on_demand_feature_views(project=project), + key=lambda on_demand_feature_view: on_demand_feature_view.name, + ): + odfv_dict = self._message_to_sorted_dict(on_demand_feature_view.to_proto()) + + odfv_dict["spec"]["userDefinedFunction"][ + "body" + ] = on_demand_feature_view.udf_string + registry_dict["onDemandFeatureViews"].append(odfv_dict) + for request_feature_view in sorted( + self.list_request_feature_views(project=project), + key=lambda request_feature_view: request_feature_view.name, + ): + registry_dict["requestFeatureViews"].append( + self._message_to_sorted_dict(request_feature_view.to_proto()) + ) + for saved_dataset in sorted( + self.list_saved_datasets(project=project), key=lambda item: item.name + ): + registry_dict["savedDatasets"].append( + self._message_to_sorted_dict(saved_dataset.to_proto()) + ) + for infra_object in sorted(self.get_infra(project=project).infra_objects): + registry_dict["infra"].append( + self._message_to_sorted_dict(infra_object.to_proto()) + ) + return registry_dict diff --git a/sdk/python/feast/infra/registry/contrib/azure/__init__.py b/sdk/python/feast/infra/registry/contrib/azure/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py b/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py new file mode 100644 index 0000000000..9c00170b0f --- /dev/null +++ b/sdk/python/feast/infra/registry/contrib/azure/azure_registry_store.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import uuid +from datetime import datetime +from pathlib import Path +from tempfile import TemporaryFile +from urllib.parse import urlparse + +from feast.infra.registry.registry import RegistryConfig +from feast.infra.registry.registry_store import RegistryStore +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto + +REGISTRY_SCHEMA_VERSION = "1" + + +class AzBlobRegistryStore(RegistryStore): + def __init__(self, registry_config: RegistryConfig, repo_path: Path): + try: + import logging + + from azure.identity import DefaultAzureCredential + from azure.storage.blob import BlobServiceClient + except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("az", str(e)) + + self._uri = urlparse(registry_config.path) + self._account_url = self._uri.scheme + "://" + self._uri.netloc + container_path = self._uri.path.lstrip("/").split("/") + self._container = container_path.pop(0) + self._path = "/".join(container_path) + + try: + # turn the verbosity of the blob client to warning and above (this reduces verbosity) + logger = logging.getLogger("azure") + logger.setLevel(logging.ERROR) + + # Attempt to use shared account key to login first + if "REGISTRY_BLOB_KEY" in os.environ: + client = BlobServiceClient( + account_url=self._account_url, + credential=os.environ["REGISTRY_BLOB_KEY"], + ) + self.blob = client.get_blob_client( + container=self._container, blob=self._path + ) + return + + default_credential = DefaultAzureCredential( + exclude_shared_token_cache_credential=True + ) + + client = BlobServiceClient( + account_url=self._account_url, credential=default_credential + ) + self.blob = client.get_blob_client( + container=self._container, blob=self._path + ) + except Exception as e: + print( + f"Could not connect to blob. Check the following\nIs the URL specified correctly?\nIs you IAM role set to Storage Blob Data Contributor? 
\n Errored out with exception {e}" + ) + + return + + def get_registry_proto(self): + file_obj = TemporaryFile() + registry_proto = RegistryProto() + + if self.blob.exists(): + download_stream = self.blob.download_blob() + file_obj.write(download_stream.readall()) + + file_obj.seek(0) + registry_proto.ParseFromString(file_obj.read()) + return registry_proto + raise FileNotFoundError( + f'Registry not found at path "{self._uri.geturl()}". Have you run "feast apply"?' + ) + + def update_registry_proto(self, registry_proto: RegistryProto): + self._write_registry(registry_proto) + + def teardown(self): + self.blob.delete_blob() + + def _write_registry(self, registry_proto: RegistryProto): + registry_proto.version_id = str(uuid.uuid4()) + registry_proto.last_updated.FromDatetime(datetime.utcnow()) + + file_obj = TemporaryFile() + file_obj.write(registry_proto.SerializeToString()) + file_obj.seek(0) + self.blob.upload_blob(file_obj, overwrite=True) # type: ignore + return diff --git a/sdk/python/feast/infra/registry_stores/contrib/postgres/registry_store.py b/sdk/python/feast/infra/registry/contrib/postgres/postgres_registry_store.py similarity index 98% rename from sdk/python/feast/infra/registry_stores/contrib/postgres/registry_store.py rename to sdk/python/feast/infra/registry/contrib/postgres/postgres_registry_store.py index b3c0c6bd36..362ec9f485 100644 --- a/sdk/python/feast/infra/registry_stores/contrib/postgres/registry_store.py +++ b/sdk/python/feast/infra/registry/contrib/postgres/postgres_registry_store.py @@ -3,10 +3,10 @@ import psycopg2 from psycopg2 import sql +from feast.infra.registry.registry_store import RegistryStore from feast.infra.utils.postgres.connection_utils import _get_conn from feast.infra.utils.postgres.postgres_config import PostgreSQLConfig from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry_store import RegistryStore from feast.repo_config import RegistryConfig diff --git 
a/sdk/python/feast/infra/registry/file.py b/sdk/python/feast/infra/registry/file.py new file mode 100644 index 0000000000..3ee75a7880 --- /dev/null +++ b/sdk/python/feast/infra/registry/file.py @@ -0,0 +1,47 @@ +import uuid +from datetime import datetime +from pathlib import Path + +from feast.infra.registry.registry_store import RegistryStore +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto +from feast.repo_config import RegistryConfig +from feast.usage import log_exceptions_and_usage + + +class FileRegistryStore(RegistryStore): + def __init__(self, registry_config: RegistryConfig, repo_path: Path): + registry_path = Path(registry_config.path) + if registry_path.is_absolute(): + self._filepath = registry_path + else: + self._filepath = repo_path.joinpath(registry_path) + + @log_exceptions_and_usage(registry="local") + def get_registry_proto(self): + registry_proto = RegistryProto() + if self._filepath.exists(): + registry_proto.ParseFromString(self._filepath.read_bytes()) + return registry_proto + raise FileNotFoundError( + f'Registry not found at path "{self._filepath}". Have you run "feast apply"?' + ) + + @log_exceptions_and_usage(registry="local") + def update_registry_proto(self, registry_proto: RegistryProto): + self._write_registry(registry_proto) + + def teardown(self): + try: + self._filepath.unlink() + except FileNotFoundError: + # If the file deletion fails with FileNotFoundError, the file has already + # been deleted. 
+ pass + + def _write_registry(self, registry_proto: RegistryProto): + registry_proto.version_id = str(uuid.uuid4()) + registry_proto.last_updated.FromDatetime(datetime.utcnow()) + file_dir = self._filepath.parent + file_dir.mkdir(exist_ok=True) + with open(self._filepath, mode="wb", buffering=0) as f: + f.write(registry_proto.SerializeToString()) diff --git a/sdk/python/feast/infra/registry/gcs.py b/sdk/python/feast/infra/registry/gcs.py new file mode 100644 index 0000000000..6f922d4ea2 --- /dev/null +++ b/sdk/python/feast/infra/registry/gcs.py @@ -0,0 +1,75 @@ +import uuid +from datetime import datetime +from pathlib import Path +from tempfile import TemporaryFile +from urllib.parse import urlparse + +from feast.infra.registry.registry_store import RegistryStore +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto +from feast.repo_config import RegistryConfig +from feast.usage import log_exceptions_and_usage + + +class GCSRegistryStore(RegistryStore): + def __init__(self, registry_config: RegistryConfig, repo_path: Path): + uri = registry_config.path + try: + import google.cloud.storage as storage + except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("gcp", str(e)) + + self.gcs_client = storage.Client() + self._uri = urlparse(uri) + self._bucket = self._uri.hostname + self._blob = self._uri.path.lstrip("/") + + @log_exceptions_and_usage(registry="gs") + def get_registry_proto(self): + import google.cloud.storage as storage + from google.cloud.exceptions import NotFound + + file_obj = TemporaryFile() + registry_proto = RegistryProto() + try: + bucket = self.gcs_client.get_bucket(self._bucket) + except NotFound: + raise Exception( + f"No bucket named {self._bucket} exists; please create it first." 
+ ) + if storage.Blob(bucket=bucket, name=self._blob).exists(self.gcs_client): + self.gcs_client.download_blob_to_file( + self._uri.geturl(), file_obj, timeout=30 + ) + file_obj.seek(0) + registry_proto.ParseFromString(file_obj.read()) + return registry_proto + raise FileNotFoundError( + f'Registry not found at path "{self._uri.geturl()}". Have you run "feast apply"?' + ) + + @log_exceptions_and_usage(registry="gs") + def update_registry_proto(self, registry_proto: RegistryProto): + self._write_registry(registry_proto) + + def teardown(self): + from google.cloud.exceptions import NotFound + + gs_bucket = self.gcs_client.get_bucket(self._bucket) + try: + gs_bucket.delete_blob(self._blob) + except NotFound: + # If the blob deletion fails with NotFound, it has already been deleted. + pass + + def _write_registry(self, registry_proto: RegistryProto): + registry_proto.version_id = str(uuid.uuid4()) + registry_proto.last_updated.FromDatetime(datetime.utcnow()) + # we have already checked the bucket exists so no need to do it again + gs_bucket = self.gcs_client.get_bucket(self._bucket) + blob = gs_bucket.blob(self._blob) + file_obj = TemporaryFile() + file_obj.write(registry_proto.SerializeToString()) + file_obj.seek(0) + blob.upload_from_file(file_obj) diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/infra/registry/registry.py similarity index 55% rename from sdk/python/feast/registry.py rename to sdk/python/feast/infra/registry/registry.py index 336bb2429f..09d22ee376 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/infra/registry/registry.py @@ -11,12 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import abc -import json import logging import uuid -from abc import abstractmethod -from collections import defaultdict from datetime import datetime, timedelta from enum import Enum from pathlib import Path @@ -24,9 +20,7 @@ from typing import Any, Dict, List, Optional from urllib.parse import urlparse -import dill from google.protobuf.internal.containers import RepeatedCompositeFieldContainer -from google.protobuf.json_format import MessageToJson from proto import Message from feast import usage @@ -48,11 +42,12 @@ from feast.feature_view import FeatureView from feast.importer import import_class from feast.infra.infra_object import Infra +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.registry.registry_store import NoopRegistryStore from feast.on_demand_feature_view import OnDemandFeatureView from feast.project_metadata import ProjectMetadata from feast.protos.feast.core.Registry_pb2 import ProjectMetadata as ProjectMetadataProto from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry_store import NoopRegistryStore from feast.repo_config import RegistryConfig from feast.repo_contents import RepoContents from feast.request_feature_view import RequestFeatureView @@ -62,17 +57,18 @@ REGISTRY_SCHEMA_VERSION = "1" REGISTRY_STORE_CLASS_FOR_TYPE = { - "GCSRegistryStore": "feast.infra.gcp.GCSRegistryStore", - "S3RegistryStore": "feast.infra.aws.S3RegistryStore", - "LocalRegistryStore": "feast.infra.local.LocalRegistryStore", - "PostgreSQLRegistryStore": "feast.infra.registry_stores.contrib.postgres.registry_store.PostgreSQLRegistryStore", + "GCSRegistryStore": "feast.infra.registry.gcs.GCSRegistryStore", + "S3RegistryStore": "feast.infra.registry.s3.S3RegistryStore", + "FileRegistryStore": "feast.infra.registry.file.FileRegistryStore", + "PostgreSQLRegistryStore": "feast.infra.registry.contrib.postgres.postgres_registry_store.PostgreSQLRegistryStore", + "AzureRegistryStore": 
"feast.infra.registry.contrib.azure.azure_registry_store.AzBlobRegistryStore", } REGISTRY_STORE_CLASS_FOR_SCHEME = { "gs": "GCSRegistryStore", "s3": "S3RegistryStore", - "file": "LocalRegistryStore", - "": "LocalRegistryStore", + "file": "FileRegistryStore", + "": "FileRegistryStore", } @@ -149,613 +145,6 @@ def get_registry_store_class_from_scheme(registry_path: str): return get_registry_store_class_from_type(registry_store_type) -class BaseRegistry(abc.ABC): - # Entity operations - @abstractmethod - def apply_entity(self, entity: Entity, project: str, commit: bool = True): - """ - Registers a single entity with Feast - - Args: - entity: Entity that will be registered - project: Feast project that this entity belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def delete_entity(self, name: str, project: str, commit: bool = True): - """ - Deletes an entity or raises an exception if not found. - - Args: - name: Name of entity - project: Feast project that this entity belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_entity(self, name: str, project: str, allow_cache: bool = False) -> Entity: - """ - Retrieves an entity. 
- - Args: - name: Name of entity - project: Feast project that this entity belongs to - allow_cache: Whether to allow returning this entity from a cached registry - - Returns: - Returns either the specified entity, or raises an exception if - none is found - """ - - @abstractmethod - def list_entities(self, project: str, allow_cache: bool = False) -> List[Entity]: - """ - Retrieve a list of entities from the registry - - Args: - allow_cache: Whether to allow returning entities from a cached registry - project: Filter entities based on project name - - Returns: - List of entities - """ - - # Data source operations - @abstractmethod - def apply_data_source( - self, data_source: DataSource, project: str, commit: bool = True - ): - """ - Registers a single data source with Feast - - Args: - data_source: A data source that will be registered - project: Feast project that this data source belongs to - commit: Whether to immediately commit to the registry - """ - - @abstractmethod - def delete_data_source(self, name: str, project: str, commit: bool = True): - """ - Deletes a data source or raises an exception if not found. - - Args: - name: Name of data source - project: Feast project that this data source belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_data_source( - self, name: str, project: str, allow_cache: bool = False - ) -> DataSource: - """ - Retrieves a data source. 
- - Args: - name: Name of data source - project: Feast project that this data source belongs to - allow_cache: Whether to allow returning this data source from a cached registry - - Returns: - Returns either the specified data source, or raises an exception if none is found - """ - - @abstractmethod - def list_data_sources( - self, project: str, allow_cache: bool = False - ) -> List[DataSource]: - """ - Retrieve a list of data sources from the registry - - Args: - project: Filter data source based on project name - allow_cache: Whether to allow returning data sources from a cached registry - - Returns: - List of data sources - """ - - # Feature service operations - @abstractmethod - def apply_feature_service( - self, feature_service: FeatureService, project: str, commit: bool = True - ): - """ - Registers a single feature service with Feast - - Args: - feature_service: A feature service that will be registered - project: Feast project that this entity belongs to - """ - - @abstractmethod - def delete_feature_service(self, name: str, project: str, commit: bool = True): - """ - Deletes a feature service or raises an exception if not found. - - Args: - name: Name of feature service - project: Feast project that this feature service belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_feature_service( - self, name: str, project: str, allow_cache: bool = False - ) -> FeatureService: - """ - Retrieves a feature service. 
- - Args: - name: Name of feature service - project: Feast project that this feature service belongs to - allow_cache: Whether to allow returning this feature service from a cached registry - - Returns: - Returns either the specified feature service, or raises an exception if - none is found - """ - - @abstractmethod - def list_feature_services( - self, project: str, allow_cache: bool = False - ) -> List[FeatureService]: - """ - Retrieve a list of feature services from the registry - - Args: - allow_cache: Whether to allow returning entities from a cached registry - project: Filter entities based on project name - - Returns: - List of feature services - """ - - # Feature view operations - @abstractmethod - def apply_feature_view( - self, feature_view: BaseFeatureView, project: str, commit: bool = True - ): - """ - Registers a single feature view with Feast - - Args: - feature_view: Feature view that will be registered - project: Feast project that this feature view belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def delete_feature_view(self, name: str, project: str, commit: bool = True): - """ - Deletes a feature view or raises an exception if not found. - - Args: - name: Name of feature view - project: Feast project that this feature view belongs to - commit: Whether the change should be persisted immediately - """ - - # stream feature view operations - @abstractmethod - def get_stream_feature_view( - self, name: str, project: str, allow_cache: bool = False - ): - """ - Retrieves a stream feature view. 
- - Args: - name: Name of stream feature view - project: Feast project that this feature view belongs to - allow_cache: Allow returning feature view from the cached registry - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ - - @abstractmethod - def list_stream_feature_views( - self, project: str, allow_cache: bool = False - ) -> List[StreamFeatureView]: - """ - Retrieve a list of stream feature views from the registry - - Args: - project: Filter stream feature views based on project name - allow_cache: Whether to allow returning stream feature views from a cached registry - - Returns: - List of stream feature views - """ - - # on demand feature view operations - @abstractmethod - def get_on_demand_feature_view( - self, name: str, project: str, allow_cache: bool = False - ) -> OnDemandFeatureView: - """ - Retrieves an on demand feature view. - - Args: - name: Name of on demand feature view - project: Feast project that this on demand feature view belongs to - allow_cache: Whether to allow returning this on demand feature view from a cached registry - - Returns: - Returns either the specified on demand feature view, or raises an exception if - none is found - """ - - @abstractmethod - def list_on_demand_feature_views( - self, project: str, allow_cache: bool = False - ) -> List[OnDemandFeatureView]: - """ - Retrieve a list of on demand feature views from the registry - - Args: - project: Filter on demand feature views based on project name - allow_cache: Whether to allow returning on demand feature views from a cached registry - - Returns: - List of on demand feature views - """ - - # regular feature view operations - @abstractmethod - def get_feature_view( - self, name: str, project: str, allow_cache: bool = False - ) -> FeatureView: - """ - Retrieves a feature view. 
- - Args: - name: Name of feature view - project: Feast project that this feature view belongs to - allow_cache: Allow returning feature view from the cached registry - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ - - @abstractmethod - def list_feature_views( - self, project: str, allow_cache: bool = False - ) -> List[FeatureView]: - """ - Retrieve a list of feature views from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature views based on project name - - Returns: - List of feature views - """ - - # request feature view operations - @abstractmethod - def get_request_feature_view(self, name: str, project: str) -> RequestFeatureView: - """ - Retrieves a request feature view. - - Args: - name: Name of request feature view - project: Feast project that this feature view belongs to - allow_cache: Allow returning feature view from the cached registry - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ - - @abstractmethod - def list_request_feature_views( - self, project: str, allow_cache: bool = False - ) -> List[RequestFeatureView]: - """ - Retrieve a list of request feature views from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature views based on project name - - Returns: - List of request feature views - """ - - @abstractmethod - def apply_materialization( - self, - feature_view: FeatureView, - project: str, - start_date: datetime, - end_date: datetime, - commit: bool = True, - ): - """ - Updates materialization intervals tracked for a single feature view in Feast - - Args: - feature_view: Feature view that will be updated with an additional materialization interval tracked - project: Feast project that this feature view belongs to - start_date (datetime): Start date of the materialization interval to track - end_date 
(datetime): End date of the materialization interval to track - commit: Whether the change should be persisted immediately - """ - - # Saved dataset operations - @abstractmethod - def apply_saved_dataset( - self, - saved_dataset: SavedDataset, - project: str, - commit: bool = True, - ): - """ - Stores a saved dataset metadata with Feast - - Args: - saved_dataset: SavedDataset that will be added / updated to registry - project: Feast project that this dataset belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_saved_dataset( - self, name: str, project: str, allow_cache: bool = False - ) -> SavedDataset: - """ - Retrieves a saved dataset. - - Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns either the specified SavedDataset, or raises an exception if - none is found - """ - - def delete_saved_dataset(self, name: str, project: str, allow_cache: bool = False): - """ - Delete a saved dataset. 
- - Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns either the specified SavedDataset, or raises an exception if - none is found - """ - - @abstractmethod - def list_saved_datasets( - self, project: str, allow_cache: bool = False - ) -> List[SavedDataset]: - """ - Retrieves a list of all saved datasets in specified project - - Args: - project: Feast project - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns the list of SavedDatasets - """ - - # Validation reference operations - @abstractmethod - def apply_validation_reference( - self, - validation_reference: ValidationReference, - project: str, - commit: bool = True, - ): - """ - Persist a validation reference - - Args: - validation_reference: ValidationReference that will be added / updated to registry - project: Feast project that this dataset belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def delete_validation_reference(self, name: str, project: str, commit: bool = True): - """ - Deletes a validation reference or raises an exception if not found. - - Args: - name: Name of validation reference - project: Feast project that this object belongs to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_validation_reference( - self, name: str, project: str, allow_cache: bool = False - ) -> ValidationReference: - """ - Retrieves a validation reference. - - Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns either the specified ValidationReference, or raises an exception if - none is found - """ - - # TODO: Needs to be implemented. 
- def list_validation_references( - self, project: str, allow_cache: bool = False - ) -> List[ValidationReference]: - - """ - Retrieve a list of validation references from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature views based on project name - - Returns: - List of request feature views - """ - - def list_project_metadata( - self, project: str, allow_cache: bool = False - ) -> List[ProjectMetadata]: - """ - Retrieves project metadata - - Args: - project: Filter metadata based on project name - allow_cache: Allow returning feature views from the cached registry - - Returns: - List of project metadata - """ - - @abstractmethod - def update_infra(self, infra: Infra, project: str, commit: bool = True): - """ - Updates the stored Infra object. - - Args: - infra: The new Infra object to be stored. - project: Feast project that the Infra object refers to - commit: Whether the change should be persisted immediately - """ - - @abstractmethod - def get_infra(self, project: str, allow_cache: bool = False) -> Infra: - """ - Retrieves the stored Infra object. - - Args: - project: Feast project that the Infra object refers to - allow_cache: Whether to allow returning this entity from a cached registry - - Returns: - The stored Infra object. - """ - - @abstractmethod - def apply_user_metadata( - self, - project: str, - feature_view: BaseFeatureView, - metadata_bytes: Optional[bytes], - ): - ... - - @abstractmethod - def get_user_metadata( - self, project: str, feature_view: BaseFeatureView - ) -> Optional[bytes]: - ... - - @abstractmethod - def proto(self) -> RegistryProto: - """ - Retrieves a proto version of the registry. - - Returns: - The registry proto object. 
- """ - - @abstractmethod - def commit(self): - """Commits the state of the registry cache to the remote registry store.""" - - @abstractmethod - def refresh(self, project: Optional[str]): - """Refreshes the state of the registry cache by fetching the registry state from the remote registry store.""" - - @staticmethod - def _message_to_sorted_dict(message: Message) -> Dict[str, Any]: - return json.loads(MessageToJson(message, sort_keys=True)) - - def to_dict(self, project: str) -> Dict[str, List[Any]]: - """Returns a dictionary representation of the registry contents for the specified project. - - For each list in the dictionary, the elements are sorted by name, so this - method can be used to compare two registries. - - Args: - project: Feast project to convert to a dict - """ - registry_dict: Dict[str, Any] = defaultdict(list) - registry_dict["project"] = project - for project_metadata in sorted(self.list_project_metadata(project=project)): - registry_dict["projectMetadata"].append( - self._message_to_sorted_dict(project_metadata.to_proto()) - ) - for data_source in sorted( - self.list_data_sources(project=project), key=lambda ds: ds.name - ): - registry_dict["dataSources"].append( - self._message_to_sorted_dict(data_source.to_proto()) - ) - for entity in sorted( - self.list_entities(project=project), key=lambda entity: entity.name - ): - registry_dict["entities"].append( - self._message_to_sorted_dict(entity.to_proto()) - ) - for feature_view in sorted( - self.list_feature_views(project=project), - key=lambda feature_view: feature_view.name, - ): - registry_dict["featureViews"].append( - self._message_to_sorted_dict(feature_view.to_proto()) - ) - for feature_service in sorted( - self.list_feature_services(project=project), - key=lambda feature_service: feature_service.name, - ): - registry_dict["featureServices"].append( - self._message_to_sorted_dict(feature_service.to_proto()) - ) - for on_demand_feature_view in sorted( - 
self.list_on_demand_feature_views(project=project), - key=lambda on_demand_feature_view: on_demand_feature_view.name, - ): - odfv_dict = self._message_to_sorted_dict(on_demand_feature_view.to_proto()) - odfv_dict["spec"]["userDefinedFunction"]["body"] = dill.source.getsource( - on_demand_feature_view.udf - ) - registry_dict["onDemandFeatureViews"].append(odfv_dict) - for request_feature_view in sorted( - self.list_request_feature_views(project=project), - key=lambda request_feature_view: request_feature_view.name, - ): - registry_dict["requestFeatureViews"].append( - self._message_to_sorted_dict(request_feature_view.to_proto()) - ) - for saved_dataset in sorted( - self.list_saved_datasets(project=project), key=lambda item: item.name - ): - registry_dict["savedDatasets"].append( - self._message_to_sorted_dict(saved_dataset.to_proto()) - ) - for infra_object in sorted(self.get_infra(project=project).infra_objects): - registry_dict["infra"].append( - self._message_to_sorted_dict(infra_object.to_proto()) - ) - return registry_dict - - def _get_project_metadata( registry_proto: Optional[RegistryProto], project: str ) -> Optional[ProjectMetadataProto]: @@ -776,10 +165,6 @@ def _init_project_metadata(cached_registry_proto: RegistryProto, project: str): class Registry(BaseRegistry): - """ - Registry: A registry allows for the management and persistence of feature definitions and related metadata. - """ - def apply_user_metadata( self, project: str, @@ -806,7 +191,7 @@ def __new__( # We override __new__ so that we can inspect registry_config and create a SqlRegistry without callers # needing to make any changes. 
if registry_config and registry_config.registry_type == "sql": - from feast.infra.registry_stores.sql import SqlRegistry + from feast.infra.registry.sql import SqlRegistry return SqlRegistry(registry_config, repo_path) else: @@ -864,14 +249,6 @@ def _initialize_registry(self, project: str): self._registry_store.update_registry_proto(registry_proto) def update_infra(self, infra: Infra, project: str, commit: bool = True): - """ - Updates the stored Infra object. - - Args: - infra: The new Infra object to be stored. - project: Feast project that the Infra object refers to - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -880,30 +257,12 @@ def update_infra(self, infra: Infra, project: str, commit: bool = True): self.commit() def get_infra(self, project: str, allow_cache: bool = False) -> Infra: - """ - Retrieves the stored Infra object. - - Args: - project: Feast project that the Infra object refers to - allow_cache: Whether to allow returning this entity from a cached registry - - Returns: - The stored Infra object. 
- """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) return Infra.from_proto(registry_proto.infra) def apply_entity(self, entity: Entity, project: str, commit: bool = True): - """ - Registers a single entity with Feast - - Args: - entity: Entity that will be registered - project: Feast project that this entity belongs to - commit: Whether the change should be persisted immediately - """ entity.is_valid() now = datetime.utcnow() @@ -931,16 +290,6 @@ def apply_entity(self, entity: Entity, project: str, commit: bool = True): self.commit() def list_entities(self, project: str, allow_cache: bool = False) -> List[Entity]: - """ - Retrieve a list of entities from the registry - - Args: - allow_cache: Whether to allow returning entities from a cached registry - project: Filter entities based on project name - - Returns: - List of entities - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -953,16 +302,6 @@ def list_entities(self, project: str, allow_cache: bool = False) -> List[Entity] def list_data_sources( self, project: str, allow_cache: bool = False ) -> List[DataSource]: - """ - Retrieve a list of data sources from the registry - - Args: - project: Filter data source based on project name - allow_cache: Whether to allow returning data sources from a cached registry - - Returns: - List of data sources - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -975,14 +314,6 @@ def list_data_sources( def apply_data_source( self, data_source: DataSource, project: str, commit: bool = True ): - """ - Registers a single data source with Feast - - Args: - data_source: A data source that will be registered - project: Feast project that this data source belongs to - commit: Whether to immediately commit to the registry - """ registry = self._prepare_registry_for_changes(project) for idx, existing_data_source_proto in enumerate(registry.data_sources): if 
existing_data_source_proto.name == data_source.name: @@ -1000,14 +331,6 @@ def apply_data_source( self.commit() def delete_data_source(self, name: str, project: str, commit: bool = True): - """ - Deletes a data source or raises an exception if not found. - - Args: - name: Name of data source - project: Feast project that this data source belongs to - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -1024,13 +347,6 @@ def delete_data_source(self, name: str, project: str, commit: bool = True): def apply_feature_service( self, feature_service: FeatureService, project: str, commit: bool = True ): - """ - Registers a single feature service with Feast - - Args: - feature_service: A feature service that will be registered - project: Feast project that this entity belongs to - """ now = datetime.utcnow() if not feature_service.created_timestamp: feature_service.created_timestamp = now @@ -1055,17 +371,6 @@ def apply_feature_service( def list_feature_services( self, project: str, allow_cache: bool = False ) -> List[FeatureService]: - """ - Retrieve a list of feature services from the registry - - Args: - allow_cache: Whether to allow returning entities from a cached registry - project: Filter entities based on project name - - Returns: - List of feature services - """ - registry = self._get_registry_proto(project=project, allow_cache=allow_cache) feature_services = [] for feature_service_proto in registry.feature_services: @@ -1078,18 +383,6 @@ def list_feature_services( def get_feature_service( self, name: str, project: str, allow_cache: bool = False ) -> FeatureService: - """ - Retrieves a feature service. 
- - Args: - name: Name of feature service - project: Feast project that this feature service belongs to - allow_cache: Whether to allow returning this feature service from a cached registry - - Returns: - Returns either the specified feature service, or raises an exception if - none is found - """ registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for feature_service_proto in registry.feature_services: @@ -1101,18 +394,6 @@ def get_feature_service( raise FeatureServiceNotFoundException(name, project=project) def get_entity(self, name: str, project: str, allow_cache: bool = False) -> Entity: - """ - Retrieves an entity. - - Args: - name: Name of entity - project: Feast project that this entity belongs to - allow_cache: Whether to allow returning this entity from a cached registry - - Returns: - Returns either the specified entity, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1124,14 +405,6 @@ def get_entity(self, name: str, project: str, allow_cache: bool = False) -> Enti def apply_feature_view( self, feature_view: BaseFeatureView, project: str, commit: bool = True ): - """ - Registers a single feature view with Feast - - Args: - feature_view: Feature view that will be registered - project: Feast project that this feature view belongs to - commit: Whether the change should be persisted immediately - """ feature_view.ensure_valid() now = datetime.utcnow() @@ -1188,16 +461,6 @@ def apply_feature_view( def list_stream_feature_views( self, project: str, allow_cache: bool = False ) -> List[StreamFeatureView]: - """ - Retrieve a list of stream feature views from the registry - - Args: - project: Filter stream feature views based on project name - allow_cache: Whether to allow returning stream feature views from a cached registry - - Returns: - List of stream feature views - """ registry = self._get_registry_proto(project=project, allow_cache=allow_cache) 
stream_feature_views = [] for stream_feature_view in registry.stream_feature_views: @@ -1210,17 +473,6 @@ def list_stream_feature_views( def list_on_demand_feature_views( self, project: str, allow_cache: bool = False ) -> List[OnDemandFeatureView]: - """ - Retrieve a list of on demand feature views from the registry - - Args: - project: Filter on demand feature views based on project name - allow_cache: Whether to allow returning on demand feature views from a cached registry - - Returns: - List of on demand feature views - """ - registry = self._get_registry_proto(project=project, allow_cache=allow_cache) on_demand_feature_views = [] for on_demand_feature_view in registry.on_demand_feature_views: @@ -1233,18 +485,6 @@ def list_on_demand_feature_views( def get_on_demand_feature_view( self, name: str, project: str, allow_cache: bool = False ) -> OnDemandFeatureView: - """ - Retrieves an on demand feature view. - - Args: - name: Name of on demand feature view - project: Feast project that this on demand feature view belongs to - allow_cache: Whether to allow returning this on demand feature view from a cached registry - - Returns: - Returns either the specified on demand feature view, or raises an exception if - none is found - """ registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for on_demand_feature_view in registry.on_demand_feature_views: @@ -1258,17 +498,6 @@ def get_on_demand_feature_view( def get_data_source( self, name: str, project: str, allow_cache: bool = False ) -> DataSource: - """ - Retrieves a data source. 
- - Args: - name: Name of data source - project: Feast project that this data source belongs to - allow_cache: Whether to allow returning this data source from a cached registry - - Returns: - Returns either the specified data source, or raises an exception if none is found - """ registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for data_source in registry.data_sources: @@ -1284,16 +513,6 @@ def apply_materialization( end_date: datetime, commit: bool = True, ): - """ - Updates materialization intervals tracked for a single feature view in Feast - - Args: - feature_view: Feature view that will be updated with an additional materialization interval tracked - project: Feast project that this feature view belongs to - start_date (datetime): Start date of the materialization interval to track - end_date (datetime): End date of the materialization interval to track - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -1348,16 +567,6 @@ def apply_materialization( def list_feature_views( self, project: str, allow_cache: bool = False ) -> List[FeatureView]: - """ - Retrieve a list of feature views from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature views based on project name - - Returns: - List of feature views - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1368,17 +577,6 @@ def list_feature_views( return feature_views def get_request_feature_view(self, name: str, project: str): - """ - Retrieves a feature view. 
- - Args: - name: Name of feature view - project: Feast project that this feature view belongs to - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto(project=project, allow_cache=False) for feature_view_proto in registry_proto.feature_views: if ( @@ -1391,16 +589,6 @@ def get_request_feature_view(self, name: str, project: str): def list_request_feature_views( self, project: str, allow_cache: bool = False ) -> List[RequestFeatureView]: - """ - Retrieve a list of request feature views from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature views based on project name - - Returns: - List of feature views - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1415,18 +603,6 @@ def list_request_feature_views( def get_feature_view( self, name: str, project: str, allow_cache: bool = False ) -> FeatureView: - """ - Retrieves a feature view. - - Args: - name: Name of feature view - project: Feast project that this feature view belongs to - allow_cache: Allow returning feature view from the cached registry - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1441,18 +617,6 @@ def get_feature_view( def get_stream_feature_view( self, name: str, project: str, allow_cache: bool = False ) -> StreamFeatureView: - """ - Retrieves a stream feature view. 
- - Args: - name: Name of stream feature view - project: Feast project that this stream feature view belongs to - allow_cache: Allow returning feature view from the cached registry - - Returns: - Returns either the specified feature view, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1465,14 +629,6 @@ def get_stream_feature_view( raise FeatureViewNotFoundException(name, project) def delete_feature_service(self, name: str, project: str, commit: bool = True): - """ - Deletes a feature service or raises an exception if not found. - - Args: - name: Name of feature service - project: Feast project that this feature service belongs to - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -1490,14 +646,6 @@ def delete_feature_service(self, name: str, project: str, commit: bool = True): raise FeatureServiceNotFoundException(name, project) def delete_feature_view(self, name: str, project: str, commit: bool = True): - """ - Deletes a feature view or raises an exception if not found. - - Args: - name: Name of feature view - project: Feast project that this feature view belongs to - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -1552,14 +700,6 @@ def delete_feature_view(self, name: str, project: str, commit: bool = True): raise FeatureViewNotFoundException(name, project) def delete_entity(self, name: str, project: str, commit: bool = True): - """ - Deletes an entity or raises an exception if not found. 
- - Args: - name: Name of entity - project: Feast project that this entity belongs to - commit: Whether the change should be persisted immediately - """ self._prepare_registry_for_changes(project) assert self.cached_registry_proto @@ -1583,14 +723,6 @@ def apply_saved_dataset( project: str, commit: bool = True, ): - """ - Stores a saved dataset metadata with Feast - - Args: - saved_dataset: SavedDataset that will be added / updated to registry - project: Feast project that this dataset belongs to - commit: Whether the change should be persisted immediately - """ now = datetime.utcnow() if not saved_dataset.created_timestamp: saved_dataset.created_timestamp = now @@ -1618,18 +750,6 @@ def apply_saved_dataset( def get_saved_dataset( self, name: str, project: str, allow_cache: bool = False ) -> SavedDataset: - """ - Retrieves a saved dataset. - - Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns either the specified SavedDataset, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1644,16 +764,6 @@ def get_saved_dataset( def list_saved_datasets( self, project: str, allow_cache: bool = False ) -> List[SavedDataset]: - """ - Retrieves a list of all saved datasets in specified project - - Args: - project: Feast project - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns the list of SavedDatasets - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1669,14 +779,6 @@ def apply_validation_reference( project: str, commit: bool = True, ): - """ - Persist a validation reference - - Args: - validation_reference: ValidationReference that will be added / updated to registry - project: Feast project that this dataset belongs to - commit: Whether the change should be 
persisted immediately - """ validation_reference_proto = validation_reference.to_proto() validation_reference_proto.project = project @@ -1698,18 +800,6 @@ def apply_validation_reference( def get_validation_reference( self, name: str, project: str, allow_cache: bool = False ) -> ValidationReference: - """ - Retrieves a validation reference. - - Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry - - Returns: - Returns either the specified ValidationReference, or raises an exception if - none is found - """ registry_proto = self._get_registry_proto( project=project, allow_cache=allow_cache ) @@ -1722,14 +812,6 @@ def get_validation_reference( raise ValidationReferenceNotFound(name, project=project) def delete_validation_reference(self, name: str, project: str, commit: bool = True): - """ - Deletes a validation reference or raises an exception if not found. - - Args: - name: Name of validation reference - project: Feast project that this object belongs to - commit: Whether the change should be persisted immediately - """ registry_proto = self._prepare_registry_for_changes(project) for idx, existing_validation_reference in enumerate( registry_proto.validation_references diff --git a/sdk/python/feast/registry_store.py b/sdk/python/feast/infra/registry/registry_store.py similarity index 100% rename from sdk/python/feast/registry_store.py rename to sdk/python/feast/infra/registry/registry_store.py diff --git a/sdk/python/feast/infra/registry/s3.py b/sdk/python/feast/infra/registry/s3.py new file mode 100644 index 0000000000..d3772910f5 --- /dev/null +++ b/sdk/python/feast/infra/registry/s3.py @@ -0,0 +1,80 @@ +import os +import uuid +from datetime import datetime +from pathlib import Path +from tempfile import TemporaryFile +from urllib.parse import urlparse + +from feast.errors import S3RegistryBucketForbiddenAccess, S3RegistryBucketNotExist +from 
feast.infra.registry.registry_store import RegistryStore +from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto +from feast.repo_config import RegistryConfig +from feast.usage import log_exceptions_and_usage + +try: + import boto3 +except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("aws", str(e)) + + +class S3RegistryStore(RegistryStore): + def __init__(self, registry_config: RegistryConfig, repo_path: Path): + uri = registry_config.path + self._uri = urlparse(uri) + self._bucket = self._uri.hostname + self._key = self._uri.path.lstrip("/") + + self.s3_client = boto3.resource( + "s3", endpoint_url=os.environ.get("FEAST_S3_ENDPOINT_URL") + ) + + @log_exceptions_and_usage(registry="s3") + def get_registry_proto(self): + file_obj = TemporaryFile() + registry_proto = RegistryProto() + try: + from botocore.exceptions import ClientError + except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("aws", str(e)) + try: + bucket = self.s3_client.Bucket(self._bucket) + self.s3_client.meta.client.head_bucket(Bucket=bucket.name) + except ClientError as e: + # If a client error is thrown, then check that it was a 404 error. + # If it was a 404 error, then the bucket does not exist. 
+ error_code = int(e.response["Error"]["Code"]) + if error_code == 404: + raise S3RegistryBucketNotExist(self._bucket) + else: + raise S3RegistryBucketForbiddenAccess(self._bucket) from e + + try: + obj = bucket.Object(self._key) + obj.download_fileobj(file_obj) + file_obj.seek(0) + registry_proto.ParseFromString(file_obj.read()) + return registry_proto + except ClientError as e: + raise FileNotFoundError( + f"Error while trying to locate Registry at path {self._uri.geturl()}" + ) from e + + @log_exceptions_and_usage(registry="s3") + def update_registry_proto(self, registry_proto: RegistryProto): + self._write_registry(registry_proto) + + def teardown(self): + self.s3_client.Object(self._bucket, self._key).delete() + + def _write_registry(self, registry_proto: RegistryProto): + registry_proto.version_id = str(uuid.uuid4()) + registry_proto.last_updated.FromDatetime(datetime.utcnow()) + # we have already checked the bucket exists so no need to do it again + file_obj = TemporaryFile() + file_obj.write(registry_proto.SerializeToString()) + file_obj.seek(0) + self.s3_client.Bucket(self._bucket).put_object(Body=file_obj, Key=self._key) diff --git a/sdk/python/feast/infra/registry_stores/sql.py b/sdk/python/feast/infra/registry/sql.py similarity index 95% rename from sdk/python/feast/infra/registry_stores/sql.py rename to sdk/python/feast/infra/registry/sql.py index 9c6b47a714..7867cdff4c 100644 --- a/sdk/python/feast/infra/registry_stores/sql.py +++ b/sdk/python/feast/infra/registry/sql.py @@ -2,7 +2,7 @@ from datetime import datetime from enum import Enum from pathlib import Path -from typing import Any, List, Optional, Set, Union +from typing import Any, Callable, List, Optional, Set, Union from sqlalchemy import ( # type: ignore BigInteger, @@ -34,6 +34,7 @@ from feast.feature_service import FeatureService from feast.feature_view import FeatureView from feast.infra.infra_object import Infra +from feast.infra.registry.base_registry import BaseRegistry from 
feast.on_demand_feature_view import OnDemandFeatureView from feast.project_metadata import ProjectMetadata from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto @@ -57,7 +58,6 @@ from feast.protos.feast.core.ValidationProfile_pb2 import ( ValidationReference as ValidationReferenceProto, ) -from feast.registry import BaseRegistry from feast.repo_config import RegistryConfig from feast.request_feature_view import RequestFeatureView from feast.saved_dataset import SavedDataset, ValidationReference @@ -560,7 +560,7 @@ def update_infra(self, infra: Infra, project: str, commit: bool = True): ) def get_infra(self, project: str, allow_cache: bool = False) -> Infra: - return self._get_object( + infra_object = self._get_object( managed_infra, "infra_obj", project, @@ -570,6 +570,8 @@ def get_infra(self, project: str, allow_cache: bool = False) -> Infra: "infra_proto", None, ) + infra_object = infra_object or InfraProto() + return Infra.from_proto(infra_object) def apply_user_metadata( self, @@ -683,11 +685,19 @@ def commit(self): pass def _apply_object( - self, table, project: str, id_field_name, obj, proto_field_name, name=None + self, + table: Table, + project: str, + id_field_name: str, + obj: Any, + proto_field_name: str, + name: Optional[str] = None, ): self._maybe_init_project_metadata(project) - name = name or obj.name + name = name or (obj.name if hasattr(obj, "name") else None) + assert name, f"name needs to be provided for {obj}" + with self.engine.connect() as conn: update_datetime = datetime.utcnow() update_time = int(update_datetime.timestamp()) @@ -712,9 +722,16 @@ def _apply_object( ) conn.execute(update_stmt) else: + obj_proto = obj.to_proto() + + if hasattr(obj_proto, "meta") and hasattr( + obj_proto.meta, "created_timestamp" + ): + obj_proto.meta.created_timestamp.FromDatetime(update_datetime) + values = { id_field_name: name, - proto_field_name: obj.to_proto().SerializeToString(), + proto_field_name: 
obj_proto.SerializeToString(), "last_updated_timestamp": update_time, "project_id": project, } @@ -749,7 +766,14 @@ def _maybe_init_project_metadata(self, project): conn.execute(insert_stmt) usage.set_current_project_uuid(new_project_uuid) - def _delete_object(self, table, name, project, id_field_name, not_found_exception): + def _delete_object( + self, + table: Table, + name: str, + project: str, + id_field_name: str, + not_found_exception: Optional[Callable], + ): with self.engine.connect() as conn: stmt = delete(table).where( getattr(table.c, id_field_name) == name, table.c.project_id == project @@ -763,14 +787,14 @@ def _delete_object(self, table, name, project, id_field_name, not_found_exceptio def _get_object( self, - table, - name, - project, - proto_class, - python_class, - id_field_name, - proto_field_name, - not_found_exception, + table: Table, + name: str, + project: str, + proto_class: Any, + python_class: Any, + id_field_name: str, + proto_field_name: str, + not_found_exception: Optional[Callable], ): self._maybe_init_project_metadata(project) @@ -782,10 +806,18 @@ def _get_object( if row: _proto = proto_class.FromString(row[proto_field_name]) return python_class.from_proto(_proto) - raise not_found_exception(name, project) + if not_found_exception: + raise not_found_exception(name, project) + else: + return None def _list_objects( - self, table, project, proto_class, python_class, proto_field_name + self, + table: Table, + project: str, + proto_class: Any, + python_class: Any, + proto_field_name: str, ): self._maybe_init_project_metadata(project) with self.engine.connect() as conn: diff --git a/sdk/python/feast/infra/transformation_servers/Dockerfile b/sdk/python/feast/infra/transformation_servers/Dockerfile index 5e77144757..c072ed0160 100644 --- a/sdk/python/feast/infra/transformation_servers/Dockerfile +++ b/sdk/python/feast/infra/transformation_servers/Dockerfile @@ -15,7 +15,7 @@ COPY README.md README.md # Install dependencies -RUN 
--mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.' +RUN --mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.[gcp,aws]' # Start feature transformation server CMD [ "python", "app.py" ] diff --git a/sdk/python/feast/infra/transformation_servers/__init__.py b/sdk/python/feast/infra/transformation_servers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/transformation_servers/app.py b/sdk/python/feast/infra/transformation_servers/app.py index acfb0959ba..7afba69beb 100644 --- a/sdk/python/feast/infra/transformation_servers/app.py +++ b/sdk/python/feast/infra/transformation_servers/app.py @@ -13,8 +13,8 @@ FEATURE_TRANSFORMATION_SERVER_PORT_ENV_NAME, REGISTRY_ENV_NAME, ) -from feast.infra.local import LocalRegistryStore -from feast.registry import get_registry_store_class_from_scheme +from feast.infra.registry.file import FileRegistryStore +from feast.infra.registry.registry import get_registry_store_class_from_scheme # Load RepoConfig config_base64 = os.environ[FEATURE_STORE_YAML_ENV_NAME] @@ -32,7 +32,7 @@ registry = raw_config["registry"] registry_path = registry["path"] if isinstance(registry, dict) else registry registry_store_class = get_registry_store_class_from_scheme(registry_path) -if registry_store_class == LocalRegistryStore and not os.path.exists(registry_path): +if registry_store_class == FileRegistryStore and not os.path.exists(registry_path): registry_base64 = os.environ[REGISTRY_ENV_NAME] registry_bytes = base64.b64decode(registry_base64) registry_dir = os.path.dirname(registry_path) diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 3c8ad9d71b..07ce3ab17d 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -21,7 +21,8 @@ RedshiftQueryError, RedshiftTableNameTooLong, ) -from feast.type_map import pa_to_redshift_value_type +from feast.type_map import 
pa_to_athena_value_type, pa_to_redshift_value_type +from feast.usage import get_user_agent try: import boto3 @@ -32,7 +33,6 @@ raise FeastExtrasDependencyImportError("aws", str(e)) - REDSHIFT_TABLE_NAME_MAX_LENGTH = 127 @@ -40,7 +40,10 @@ def get_redshift_data_client(aws_region: str): """ Get the Redshift Data API Service client for the given AWS region. """ - return boto3.client("redshift-data", config=Config(region_name=aws_region)) + return boto3.client( + "redshift-data", + config=Config(region_name=aws_region, user_agent=get_user_agent()), + ) def get_s3_resource(aws_region: str): @@ -672,3 +675,333 @@ def list_s3_files(aws_region: str, path: str) -> List[str]: contents = objects["Contents"] files = [f"s3://{bucket}/{content['Key']}" for content in contents] return files + + +# Athena + + +def get_athena_data_client(aws_region: str): + """ + Get the athena Data API Service client for the given AWS region. + """ + return boto3.client( + "athena", config=Config(region_name=aws_region, user_agent=get_user_agent()) + ) + + +@retry( + wait=wait_exponential(multiplier=1, max=4), + retry=retry_if_exception_type(ConnectionClosedError), + stop=stop_after_attempt(5), + reraise=True, +) +def execute_athena_query_async( + athena_data_client, data_source: str, database: str, query: str +) -> dict: + """Execute Athena statement asynchronously. Does not wait for the query to finish. + + Raises AthenaCredentialsError if the statement couldn't be executed due to the validation error. 
+ + Args: + athena_data_client: athena Data API Service client + data_source: athena Cluster Identifier + database: athena Database Name + query: The SQL query to execute + + Returns: JSON response + + """ + try: + # return athena_data_client.execute_statement( + return athena_data_client.start_query_execution( + QueryString=query, + QueryExecutionContext={"Database": database}, + WorkGroup="primary", + ) + + except ClientError as e: + raise AthenaQueryError(e) + + +class AthenaStatementNotFinishedError(Exception): + pass + + +@retry( + wait=wait_exponential(multiplier=1, max=30), + retry=retry_if_exception_type(AthenaStatementNotFinishedError), + reraise=True, +) +def wait_for_athena_execution(athena_data_client, execution: dict) -> None: + """Waits for the Athena statement to finish. Raises AthenaQueryError if the statement didn't succeed. + + We use exponential backoff for checking the query state until it's not running. The backoff starts with + 0.1 seconds and doubles exponentially until reaching 30 seconds, at which point the backoff is fixed. + + Args: + athena_data_client: athena Service boto3 client + execution: The athena execution to wait for (result of execute_athena_statement) + + Returns: None + + """ + response = athena_data_client.get_query_execution( + QueryExecutionId=execution["QueryExecutionId"] + ) + if response["QueryExecution"]["Status"]["State"] in ("QUEUED", "RUNNING"): + raise AthenaStatementNotFinishedError # Retry + if response["QueryExecution"]["Status"]["State"] != "SUCCEEDED": + raise AthenaQueryError(response) # Don't retry. Raise exception. 
+ + +def drop_temp_table( + athena_data_client, data_source: str, database: str, temp_table: str +): + query = f"DROP TABLE `{database}.{temp_table}`" + execute_athena_query_async(athena_data_client, data_source, database, query) + + +def execute_athena_query( + athena_data_client, + data_source: str, + database: str, + query: str, + temp_table: str = None, +) -> str: + """Execute athena statement synchronously. Waits for the query to finish. + + Raises athenaCredentialsError if the statement couldn't be executed due to the validation error. + Raises athenaQueryError if the query runs but finishes with errors. + + + Args: + athena_data_client: athena Data API Service client + data_source: athena data source Name + database: athena Database Name + query: The SQL query to execute + temp_table: temp table name to be deleted after query execution. + + Returns: Statement ID + + """ + + execution = execute_athena_query_async( + athena_data_client, data_source, database, query + ) + wait_for_athena_execution(athena_data_client, execution) + if temp_table is not None: + drop_temp_table(athena_data_client, data_source, database, temp_table) + + return execution["QueryExecutionId"] + + +def get_athena_query_result(athena_data_client, query_execution_id: str) -> dict: + """Get the athena query result""" + response = athena_data_client.get_query_results(QueryExecutionId=query_execution_id) + return response["ResultSet"] + + +class AthenaError(Exception): + def __init__(self, details): + super().__init__(f"Athena API failed. Details: {details}") + + +class AthenaQueryError(Exception): + def __init__(self, details): + super().__init__(f"Athena SQL Query failed to finish. Details: {details}") + + +class AthenaTableNameTooLong(Exception): + def __init__(self, table_name: str): + super().__init__( + f"Athena table(Data catalog) names have a maximum length of 255 characters, but the table name {table_name} has length {len(table_name)} characters." 
+ ) + + +def unload_athena_query_to_pa( + athena_data_client, + data_source: str, + database: str, + s3_resource, + s3_path: str, + query: str, + temp_table: str, +) -> pa.Table: + """Unload Athena Query results to S3 and get the results in PyArrow Table format""" + bucket, key = get_bucket_and_key(s3_path) + + execute_athena_query_and_unload_to_s3( + athena_data_client, data_source, database, query, temp_table + ) + + with tempfile.TemporaryDirectory() as temp_dir: + download_s3_directory(s3_resource, bucket, key, temp_dir) + delete_s3_directory(s3_resource, bucket, key) + return pq.read_table(temp_dir) + + +def unload_athena_query_to_df( + athena_data_client, + data_source: str, + database: str, + s3_resource, + s3_path: str, + query: str, + temp_table: str, +) -> pd.DataFrame: + """Unload Athena Query results to S3 and get the results in Pandas DataFrame format""" + table = unload_athena_query_to_pa( + athena_data_client, + data_source, + database, + s3_resource, + s3_path, + query, + temp_table, + ) + return table.to_pandas() + + +def execute_athena_query_and_unload_to_s3( + athena_data_client, + data_source: str, + database: str, + query: str, + temp_table: str, +) -> None: + """Unload Athena Query results to S3 + + Args: + athena_data_client: Athena Data API Service client + data_source: Athena data source + database: Redshift Database Name + query: The SQL query to execute + temp_table: temp table name to be deleted after query execution. + + """ + + execute_athena_query(athena_data_client, data_source, database, query, temp_table) + + +def upload_df_to_athena( + athena_client, + data_source: str, + database: str, + s3_resource, + s3_path: str, + table_name: str, + df: pd.DataFrame, +): + """Uploads a Pandas DataFrame to S3(Athena) as a new table. + + The caller is responsible for deleting the table when no longer necessary. 
+ + Args: + athena_client: Athena API Service client + data_source: Athena Data Source + database: Athena Database Name + s3_resource: S3 Resource object + s3_path: S3 path where the Parquet file is temporarily uploaded + table_name: The name of the new Data Catalog table where we copy the dataframe + df: The Pandas DataFrame to upload + + Raises: + AthenaTableNameTooLong: The specified table name is too long. + """ + + # Drop the index so that we dont have unnecessary columns + df.reset_index(drop=True, inplace=True) + + # Convert Pandas DataFrame into PyArrow table and compile the Athena table schema. + # Note, if the underlying data has missing values, + # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean. + # If the dtype is 'object', then missing values are inferred as python `None`s. + # More details at: + # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing + table = pa.Table.from_pandas(df) + upload_arrow_table_to_athena( + table, + athena_client, + data_source=data_source, + database=database, + s3_resource=s3_resource, + s3_path=s3_path, + table_name=table_name, + ) + + +def upload_arrow_table_to_athena( + table: Union[pyarrow.Table, Path], + athena_client, + data_source: str, + database: str, + s3_resource, + s3_path: str, + table_name: str, + schema: Optional[pyarrow.Schema] = None, + fail_if_exists: bool = True, +): + """Uploads an Arrow Table to S3(Athena). + + Here's how the upload process works: + 1. PyArrow Table is serialized into a Parquet format on local disk + 2. The Parquet file is uploaded to S3 + 3. an Athena(data catalog) table is created. the S3 directory(in number 2) will be set as an external location. + 4. 
The local disk & s3 paths are cleaned up + + Args: + table: The Arrow Table or Path to parquet dataset to upload + athena_client: Athena API Service client + data_source: Athena data source + database: Athena Database Name + s3_resource: S3 Resource object + s3_path: S3 path where the Parquet file is temporarily uploaded + table_name: The name of the new Athena table where we copy the dataframe + schema: (Optionally) client may provide arrow Schema which will be converted into Athena table schema + fail_if_exists: fail if table with such name exists or append data to existing table + + Raises: + AthenaTableNameTooLong: The specified table name is too long. + """ + DATA_CATALOG_TABLE_NAME_MAX_LENGTH = 255 + + if len(table_name) > DATA_CATALOG_TABLE_NAME_MAX_LENGTH: + raise AthenaTableNameTooLong(table_name) + + if isinstance(table, pyarrow.Table) and not schema: + schema = table.schema + + if not schema: + raise ValueError("Schema must be specified when data is passed as a Path") + + bucket, key = get_bucket_and_key(s3_path) + + column_query_list = ", ".join( + [f"`{field.name}` {pa_to_athena_value_type(field.type)}" for field in schema] + ) + + with tempfile.TemporaryFile(suffix=".parquet") as parquet_temp_file: + pq.write_table(table, parquet_temp_file) + parquet_temp_file.seek(0) + s3_resource.Object(bucket, key).put(Body=parquet_temp_file) + + create_query = ( + f"CREATE EXTERNAL TABLE {database}.{table_name} " + f"({column_query_list}) " + f"STORED AS PARQUET " + f"LOCATION '{s3_path[:s3_path.rfind('/')]}' " + f"TBLPROPERTIES('parquet.compress' = 'SNAPPY') " + ) + + try: + execute_athena_query( + athena_client, + data_source, + database, + f"{create_query}", + ) + finally: + pass + # Clean up S3 temporary data + # for file_path in uploaded_files: + # s3_resource.Object(bucket, file_path).delete() diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake/snowflake_utils.py similarity index 92% rename from 
sdk/python/feast/infra/utils/snowflake_utils.py rename to sdk/python/feast/infra/utils/snowflake/snowflake_utils.py index f54288e45d..c7b27d8331 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake/snowflake_utils.py @@ -1,6 +1,7 @@ import configparser import os import random +import shutil import string from logging import getLogger from pathlib import Path @@ -18,7 +19,9 @@ wait_exponential, ) +import feast from feast.errors import SnowflakeIncompleteConfig, SnowflakeQueryUnknownError +from feast.feature_view import FeatureView try: import snowflake.connector @@ -36,6 +39,16 @@ logger = getLogger(__name__) +def assert_snowflake_feature_names(feature_view: FeatureView) -> None: + for feature in feature_view.features: + assert feature.name not in [ + "entity_key", + "feature_name", + "feature_value", + ], f"Feature Name: {feature.name} is a protected name to ensure query stability" + return None + + def execute_snowflake_statement(conn: SnowflakeConnection, query) -> SnowflakeCursor: cursor = conn.cursor().execute(query) if cursor is None: @@ -44,10 +57,12 @@ def execute_snowflake_statement(conn: SnowflakeConnection, query) -> SnowflakeCu def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: - assert config.type in ["snowflake.offline", "snowflake.online"] + assert config.type in ["snowflake.offline", "snowflake.engine", "snowflake.online"] if config.type == "snowflake.offline": config_header = "connections.feast_offline_store" + if config.type == "snowflake.engine": + config_header = "connections.feast_batch_engine" elif config.type == "snowflake.online": config_header = "connections.feast_online_store" @@ -60,19 +75,13 @@ def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: if config_reader.has_section(config_header): kwargs = dict(config_reader[config_header]) - if "schema" in kwargs: - kwargs["schema_"] = kwargs.pop("schema") - kwargs.update((k, v) for k, v in 
config_dict.items() if v is not None) for k, v in kwargs.items(): if k in ["role", "warehouse", "database", "schema_"]: kwargs[k] = f'"{v}"' - if "schema_" in kwargs: - kwargs["schema"] = kwargs.pop("schema_") - else: - kwargs["schema"] = '"PUBLIC"' + kwargs["schema"] = kwargs.pop("schema_") # https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication-key-pair-rotation # https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication @@ -83,14 +92,57 @@ def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: try: conn = snowflake.connector.connect( - application="feast", autocommit=autocommit, **kwargs + application="feast", + autocommit=autocommit, + **kwargs, ) + conn.cursor().execute("ALTER SESSION SET TIMEZONE = 'UTC'", _is_internal=True) + return conn except KeyError as e: raise SnowflakeIncompleteConfig(e) +def package_snowpark_zip(project_name) -> Tuple[str, str]: + path = os.path.dirname(feast.__file__) + copy_path = path + f"/snowflake_feast_{project_name}" + + if os.path.exists(copy_path): + shutil.rmtree(copy_path) + + copy_files = [ + "/infra/utils/snowflake/snowpark/snowflake_udfs.py", + "/infra/key_encoding_utils.py", + "/type_map.py", + "/value_type.py", + "/protos/feast/types/Value_pb2.py", + "/protos/feast/types/EntityKey_pb2.py", + ] + + package_path = copy_path + "/feast" + for feast_file in copy_files: + idx = feast_file.rfind("/") + if idx > -1: + Path(package_path + feast_file[:idx]).mkdir(parents=True, exist_ok=True) + feast_file = shutil.copy(path + feast_file, package_path + feast_file[:idx]) + else: + feast_file = shutil.copy(path + feast_file, package_path + feast_file) + + zip_path = shutil.make_archive(package_path, "zip", copy_path) + + return copy_path, zip_path + + +def _run_snowflake_field_mapping(snowflake_job_sql: str, field_mapping: dict) -> str: + snowflake_mapped_sql = snowflake_job_sql + for key in field_mapping.keys(): + 
snowflake_mapped_sql = snowflake_mapped_sql.replace( + f'"{key}"', f'"{key}" AS "{field_mapping[key]}"', 1 + ) + return snowflake_mapped_sql + + # TO DO -- sfc-gh-madkins # Remove dependency on write_pandas function by falling back to native snowflake python connector # Current issue is datetime[ns] types are read incorrectly in Snowflake, need to coerce to datetime[ns, UTC] diff --git a/sdk/python/feast/infra/utils/snowflake/snowpark/__init__.py b/sdk/python/feast/infra/utils/snowflake/snowpark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_creation.sql b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_creation.sql new file mode 100644 index 0000000000..a197a3ee4c --- /dev/null +++ b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_creation.sql @@ -0,0 +1,71 @@ +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_binary_to_bytes_proto(df BINARY) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_binary_to_bytes_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_varchar_to_string_proto(df VARCHAR) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_varchar_to_string_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_number_to_int32_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_number_to_int32_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS 
feast_PROJECT_NAME_snowflake_number_to_int64_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_number_to_int64_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_float_to_double_proto(df DOUBLE) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_float_to_double_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_boolean_to_bool_proto(df BOOLEAN) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_boolean_to_bool_boolean_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_snowflake_timestamp_to_unix_timestamp_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_timestamp_to_unix_timestamp_proto' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_serialize_entity_keys(names ARRAY, data ARRAY, types ARRAY) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_serialize_entity_keys' + IMPORTS = ('@STAGE_HOLDER/feast.zip'); + +CREATE FUNCTION IF NOT EXISTS feast_PROJECT_NAME_entity_key_proto_to_string(names ARRAY, data ARRAY, types ARRAY) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 
'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_entity_key_proto_to_string' + IMPORTS = ('@STAGE_HOLDER/feast.zip') diff --git a/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_deletion.sql b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_deletion.sql new file mode 100644 index 0000000000..ff96bdd8f2 --- /dev/null +++ b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_python_udfs_deletion.sql @@ -0,0 +1,17 @@ +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_binary_to_bytes_proto(BINARY); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_varchar_to_string_proto(VARCHAR); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_number_to_int32_proto(NUMBER); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_number_to_int64_proto(NUMBER); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_float_to_double_proto(DOUBLE); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_boolean_to_bool_proto(BOOLEAN); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_snowflake_timestamp_to_unix_timestamp_proto(NUMBER); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_serialize_entity_keys(ARRAY, ARRAY, ARRAY); + +DROP FUNCTION IF EXISTS feast_PROJECT_NAME_entity_key_proto_to_string(ARRAY, ARRAY, ARRAY) diff --git a/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_udfs.py b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_udfs.py new file mode 100644 index 0000000000..7fde4dd3a1 --- /dev/null +++ b/sdk/python/feast/infra/utils/snowflake/snowpark/snowflake_udfs.py @@ -0,0 +1,261 @@ +from binascii import unhexlify + +import pandas +from _snowflake import vectorized + +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.type_map import ( + _convert_value_type_str_to_value_type, + python_values_to_proto_values, +) +from 
feast.value_type import ValueType + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_binary_to_bytes_proto(df BINARY) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_binary_to_bytes_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.BYTES = 1 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_binary_to_bytes_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.BYTES), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_varchar_to_string_proto(df VARCHAR) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_varchar_to_string_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.STRING = 2 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_varchar_to_string_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.STRING), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_number_to_int32_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_number_to_int32_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.INT32 = 3 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_number_to_int32_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.INT32), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_number_to_int64_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 
'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_number_to_int64_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.INT64 = 4 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_number_to_int64_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.INT64), + ) + ) + return df + + +# All floating-point numbers stored as double +# https://docs.snowflake.com/en/sql-reference/data-types-numeric.html#data-types-for-floating-point-numbers +""" +CREATE OR REPLACE FUNCTION feast_snowflake_float_to_double_proto(df DOUBLE) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_float_to_double_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.FLOAT = 5 & ValueType.DOUBLE = 6 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_float_to_double_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.DOUBLE), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_boolean_to_bool_proto(df BOOLEAN) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_boolean_to_bool_boolean_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.BOOL = 7 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_boolean_to_bool_boolean_proto(df): + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values(df[0].to_numpy(), ValueType.BOOL), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_snowflake_timestamp_to_unix_timestamp_proto(df NUMBER) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 
'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_snowflake_timestamp_to_unix_timestamp_proto' + IMPORTS = ('@feast_stage/feast.zip'); +""" +# ValueType.UNIX_TIMESTAMP = 8 +@vectorized(input=pandas.DataFrame) +def feast_snowflake_timestamp_to_unix_timestamp_proto(df): + + df = list( + map( + ValueProto.SerializeToString, + python_values_to_proto_values( + pandas.to_datetime(df[0], unit="ns").to_numpy(), + ValueType.UNIX_TIMESTAMP, + ), + ) + ) + return df + + +""" +CREATE OR REPLACE FUNCTION feast_serialize_entity_keys(names ARRAY, data ARRAY, types ARRAY) + RETURNS BINARY + LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_serialize_entity_keys' + IMPORTS = ('@feast_stage/feast.zip') +""" +# converts 1 to n many entity keys to a single binary for lookups +@vectorized(input=pandas.DataFrame) +def feast_serialize_entity_keys(df): + join_keys = create_entity_dict(df[0].values[0], df[2].values[0]) + + df = pandas.DataFrame.from_dict( + dict(zip(df[1].index, df[1].values)), orient="index", columns=df[0].values[0] + ) + + proto_values_by_column = {} + for column, value_type in list(join_keys.items()): + # BINARY is converted to a hex string, we need to convert back + if value_type == ValueType.BYTES: + proto_values = python_values_to_proto_values( + list(map(unhexlify, df[column].tolist())), value_type + ) + else: + proto_values = python_values_to_proto_values( + df[column].to_numpy(), value_type + ) + + proto_values_by_column.update({column: proto_values}) + + serialized_entity_keys = [ + serialize_entity_key( + EntityKeyProto( + join_keys=join_keys, + entity_values=[proto_values_by_column[k][idx] for k in join_keys], + ), + entity_key_serialization_version=2, + ) + for idx in range(df.shape[0]) + ] + return serialized_entity_keys + + +""" +CREATE OR REPLACE FUNCTION feast_entity_key_proto_to_string(names ARRAY, data ARRAY, types ARRAY) + RETURNS BINARY + 
LANGUAGE PYTHON + RUNTIME_VERSION = '3.8' + PACKAGES = ('protobuf', 'pandas') + HANDLER = 'feast.infra.utils.snowflake.snowpark.snowflake_udfs.feast_entity_key_proto_to_string' + IMPORTS = ('@feast_stage/feast.zip') +""" +# converts 1 to n many entity keys to a single binary for lookups +@vectorized(input=pandas.DataFrame) +def feast_entity_key_proto_to_string(df): + join_keys = create_entity_dict(df[0].values[0], df[2].values[0]) + + df = pandas.DataFrame.from_dict( + dict(zip(df[1].index, df[1].values)), orient="index", columns=df[0].values[0] + ) + + proto_values_by_column = {} + for column, value_type in list(join_keys.items()): + # BINARY is converted to a hex string, we need to convert back + if value_type == ValueType.BYTES: + proto_values = python_values_to_proto_values( + list(map(unhexlify, df[column].tolist())), value_type + ) + else: + proto_values = python_values_to_proto_values( + df[column].to_numpy(), value_type + ) + + proto_values_by_column.update({column: proto_values}) + + serialized_entity_keys = [ + EntityKeyProto( + join_keys=join_keys, + entity_values=[proto_values_by_column[k][idx] for k in join_keys], + ).SerializeToString() + for idx in range(df.shape[0]) + ] + return serialized_entity_keys + + +def create_entity_dict(names, types): + return dict( + zip( + names, + [_convert_value_type_str_to_value_type(type_str) for type_str in types], + ) + ) diff --git a/sdk/python/feast/on_demand_feature_view.py b/sdk/python/feast/on_demand_feature_view.py index bb45dd6eb6..fcafeaa2bc 100644 --- a/sdk/python/feast/on_demand_feature_view.py +++ b/sdk/python/feast/on_demand_feature_view.py @@ -13,7 +13,6 @@ from feast.batch_feature_view import BatchFeatureView from feast.data_source import RequestSource from feast.errors import RegistryInferenceFailure, SpecifiedFeaturesNotPresentError -from feast.feature import Feature from feast.feature_view import FeatureView from feast.feature_view_projection import FeatureViewProjection from feast.field import 
Field, from_value_type @@ -28,7 +27,6 @@ from feast.protos.feast.core.OnDemandFeatureView_pb2 import ( UserDefinedFunction as UserDefinedFunctionProto, ) -from feast.stream_feature_view import StreamFeatureView from feast.type_map import ( feast_value_type_to_pandas_type, python_type_to_feast_value_type, @@ -61,12 +59,12 @@ class OnDemandFeatureView(BaseFeatureView): maintainer. """ - # TODO(adchia): remove inputs from proto and declaration name: str features: List[Field] source_feature_view_projections: Dict[str, FeatureViewProjection] source_request_sources: Dict[str, RequestSource] udf: FunctionType + udf_string: str description: str tags: Dict[str, str] owner: str @@ -74,17 +72,18 @@ class OnDemandFeatureView(BaseFeatureView): @log_exceptions # noqa: C901 def __init__( # noqa: C901 self, - *args, - name: Optional[str] = None, - features: Optional[List[Feature]] = None, - sources: Optional[ - List[Any] - ] = None, # Typed as Any because @typechecked can't deal with the List[Union] - udf: Optional[FunctionType] = None, - inputs: Optional[ - Dict[str, Union[FeatureView, FeatureViewProjection, RequestSource]] - ] = None, - schema: Optional[List[Field]] = None, + *, + name: str, + schema: List[Field], + sources: List[ + Union[ + FeatureView, + RequestSource, + FeatureViewProjection, + ] + ], + udf: FunctionType, + udf_string: str = "", description: str = "", tags: Optional[Dict[str, str]] = None, owner: str = "", @@ -94,136 +93,30 @@ def __init__( # noqa: C901 Args: name: The unique name of the on demand feature view. - features (deprecated): The list of features in the output of the on demand - feature view, after the transformation has been applied. - sources (optional): A map from input source names to the actual input sources, - which may be feature views, or request data sources. - These sources serve as inputs to the udf, which will refer to them by name. 
- udf (optional): The user defined transformation function, which must take pandas + schema: The list of features in the output of the on demand feature view, after + the transformation has been applied. + sources: A map from input source names to the actual input sources, which may be + feature views, or request data sources. These sources serve as inputs to the udf, + which will refer to them by name. + udf: The user defined transformation function, which must take pandas dataframes as inputs. - inputs (optional): (Deprecated) A map from input source names to the actual input sources, - which may be feature views, feature view projections, or request data sources. - These sources serve as inputs to the udf, which will refer to them by name. - schema (optional): The list of features in the output of the on demand feature - view, after the transformation has been applied. + udf_string: The source code version of the udf (for diffing and displaying in Web UI) description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the on demand feature view, typically the email of the primary maintainer. """ - positional_attributes = ["name", "features", "inputs", "udf"] - - _name = name - - _schema = schema or [] - if len(_schema) == 0 and features is not None: - _schema = [Field.from_feature(feature) for feature in features] - if features is not None: - warnings.warn( - ( - "The `features` parameter is being deprecated in favor of the `schema` parameter. " - "Please switch from using `features` to `schema`. This will also requiring switching " - "feature definitions from using `Feature` to `Field`. Feast 0.24 and onwards will not " - "support the `features` parameter." 
- ), - DeprecationWarning, - ) - _sources = sources or [] - if inputs and sources: - raise ValueError("At most one of `sources` or `inputs` can be specified.") - elif inputs: - warnings.warn( - ( - "The `inputs` parameter is being deprecated. Please use `sources` instead. " - "Feast 0.24 and onwards will not support the `inputs` parameter." - ), - DeprecationWarning, - ) - for _, source in inputs.items(): - if isinstance(source, FeatureView): - _sources.append(feature_view_to_batch_feature_view(source)) - elif isinstance(source, RequestSource) or isinstance( - source, FeatureViewProjection - ): - _sources.append(source) - else: - raise ValueError( - "input can only accept FeatureView, FeatureViewProjection, or RequestSource" - ) - _udf: Optional[FunctionType] = udf - - if args: - warnings.warn( - ( - "On demand feature view parameters should be specified as keyword arguments " - "instead of positional arguments. Feast 0.24 and onwards will not support " - "positional arguments in on demand feature view definitions." - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args " - f"when defining feature views, for backwards compatibility." - ) - if len(args) >= 1: - _name = args[0] - if len(args) >= 2: - _schema = args[1] - # Convert Features to Fields. - if len(_schema) > 0 and isinstance(_schema[0], Feature): - _schema = [Field.from_feature(feature) for feature in _schema] - warnings.warn( - ( - "The `features` parameter is being deprecated in favor of the `schema` parameter. " - "Please switch from using `features` to `schema`. This will also requiring switching " - "feature definitions from using `Feature` to `Field`. Feast 0.24 and onwards will not " - "support the `features` parameter." 
- ), - DeprecationWarning, - ) - if len(args) >= 3: - _inputs = args[2] - for _, source in _inputs.items(): - if isinstance(source, FeatureView): - _sources.append(feature_view_to_batch_feature_view(source)) - elif isinstance(source, RequestSource) or isinstance( - source, FeatureViewProjection - ): - _sources.append(source) - else: - raise ValueError( - "input can only accept FeatureView, FeatureViewProjection, or RequestSource" - ) - warnings.warn( - ( - "The `inputs` parameter is being deprecated. Please use `sources` instead. " - "Feast 0.24 and onwards will not support the `inputs` parameter." - ), - DeprecationWarning, - ) - if len(args) >= 4: - _udf = args[3] - - if not _name: - raise ValueError( - "The name of the on demand feature view must be specified." - ) - - if not _sources: - raise ValueError("The `sources` parameter must be specified.") - super().__init__( - name=_name, - features=_schema, + name=name, + features=schema, description=description, tags=tags, owner=owner, ) - assert _sources is not None + self.source_feature_view_projections: Dict[str, FeatureViewProjection] = {} self.source_request_sources: Dict[str, RequestSource] = {} - for odfv_source in _sources: + for odfv_source in sources: if isinstance(odfv_source, RequestSource): self.source_request_sources[odfv_source.name] = odfv_source elif isinstance(odfv_source, FeatureViewProjection): @@ -233,22 +126,21 @@ def __init__( # noqa: C901 odfv_source.name ] = odfv_source.projection - if _udf is None: - raise ValueError("The `udf` parameter must be specified.") - self.udf = _udf # type: ignore + self.udf = udf # type: ignore + self.udf_string = udf_string @property def proto_class(self) -> Type[OnDemandFeatureViewProto]: return OnDemandFeatureViewProto def __copy__(self): - fv = OnDemandFeatureView( name=self.name, schema=self.features, sources=list(self.source_feature_view_projections.values()) + list(self.source_request_sources.values()), udf=self.udf, + udf_string=self.udf_string, 
description=self.description, tags=self.tags, owner=self.owner, @@ -269,6 +161,7 @@ def __eq__(self, other): self.source_feature_view_projections != other.source_feature_view_projections or self.source_request_sources != other.source_request_sources + or self.udf_string != other.udf_string or self.udf.__code__.co_code != other.udf.__code__.co_code ): return False @@ -310,6 +203,7 @@ def to_proto(self) -> OnDemandFeatureViewProto: user_defined_function=UserDefinedFunctionProto( name=self.udf.__name__, body=dill.dumps(self.udf, recurse=True), + body_text=self.udf_string, ), description=self.description, tags=self.tags, @@ -362,6 +256,7 @@ def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto): udf=dill.loads( on_demand_feature_view_proto.spec.user_defined_function.body ), + udf_string=on_demand_feature_view_proto.spec.user_defined_function.body_text, description=on_demand_feature_view_proto.spec.description, tags=dict(on_demand_feature_view_proto.spec.tags), owner=on_demand_feature_view_proto.spec.owner, @@ -514,23 +409,16 @@ def get_requested_odfvs(feature_refs, project, registry): return requested_on_demand_feature_views -# TODO(felixwang9817): Force this decorator to accept kwargs and switch from -# `features` to `schema`. def on_demand_feature_view( - *args, - features: Optional[List[Feature]] = None, - sources: Optional[ - List[ - Union[ - BatchFeatureView, - StreamFeatureView, - RequestSource, - FeatureViewProjection, - ] + *, + schema: List[Field], + sources: List[ + Union[ + FeatureView, + RequestSource, + FeatureViewProjection, ] - ] = None, - inputs: Optional[Dict[str, Union[FeatureView, RequestSource]]] = None, - schema: Optional[List[Field]] = None, + ], description: str = "", tags: Optional[Dict[str, str]] = None, owner: str = "", @@ -539,110 +427,16 @@ def on_demand_feature_view( Creates an OnDemandFeatureView object with the given user function as udf. 
Args: - features (deprecated): The list of features in the output of the on demand - feature view, after the transformation has been applied. - sources (optional): A map from input source names to the actual input sources, - which may be feature views, or request data sources. - These sources serve as inputs to the udf, which will refer to them by name. - inputs (optional): A map from input source names to the actual input sources, - which may be feature views, feature view projections, or request data sources. - These sources serve as inputs to the udf, which will refer to them by name. - schema (optional): The list of features in the output of the on demand feature - view, after the transformation has been applied. + schema: The list of features in the output of the on demand feature view, after + the transformation has been applied. + sources: A map from input source names to the actual input sources, which may be + feature views, or request data sources. These sources serve as inputs to the udf, + which will refer to them by name. description (optional): A human-readable description. tags (optional): A dictionary of key-value pairs to store arbitrary metadata. owner (optional): The owner of the on demand feature view, typically the email of the primary maintainer. """ - positional_attributes = ["features", "inputs"] - - _schema = schema or [] - if len(_schema) == 0 and features is not None: - _schema = [Field.from_feature(feature) for feature in features] - if features is not None: - warnings.warn( - ( - "The `features` parameter is being deprecated in favor of the `schema` parameter. " - "Please switch from using `features` to `schema`. This will also requiring switching " - "feature definitions from using `Feature` to `Field`. Feast 0.24 and onwards will not " - "support the `features` parameter." 
- ), - DeprecationWarning, - ) - _sources = sources or [] - if inputs and sources: - raise ValueError("At most one of `sources` or `inputs` can be specified.") - elif inputs: - warnings.warn( - ( - "The `inputs` parameter is being deprecated. Please use `sources` instead. " - "Feast 0.24 and onwards will not support the `inputs` parameter." - ), - DeprecationWarning, - ) - for _, source in inputs.items(): - if isinstance(source, FeatureView): - _sources.append(feature_view_to_batch_feature_view(source)) - elif isinstance(source, RequestSource) or isinstance( - source, FeatureViewProjection - ): - _sources.append(source) - else: - raise ValueError( - "input can only accept FeatureView, FeatureViewProjection, or RequestSource" - ) - - if args: - warnings.warn( - ( - "On demand feature view parameters should be specified as keyword arguments " - "instead of positional arguments. Feast 0.24 and onwards will not support " - "positional arguments in on demand feature view definitions." - ), - DeprecationWarning, - ) - if len(args) > len(positional_attributes): - raise ValueError( - f"Only {', '.join(positional_attributes)} are allowed as positional args " - f"when defining feature views, for backwards compatibility." - ) - if len(args) >= 1: - _schema = args[0] - # Convert Features to Fields. - if len(_schema) > 0 and isinstance(_schema[0], Feature): - _schema = [Field.from_feature(feature) for feature in _schema] - warnings.warn( - ( - "The `features` parameter is being deprecated in favor of the `schema` parameter. " - "Please switch from using `features` to `schema`. This will also requiring switching " - "feature definitions from using `Feature` to `Field`. Feast 0.24 and onwards will not " - "support the `features` parameter." 
- ), - DeprecationWarning, - ) - if len(args) >= 2: - _inputs = args[1] - for _, source in _inputs.items(): - if isinstance(source, FeatureView): - _sources.append(feature_view_to_batch_feature_view(source)) - elif isinstance(source, RequestSource) or isinstance( - source, FeatureViewProjection - ): - _sources.append(source) - else: - raise ValueError( - "input can only accept FeatureView, FeatureViewProjection, or RequestSource" - ) - warnings.warn( - ( - "The `inputs` parameter is being deprecated. Please use `sources` instead. " - "Feast 0.24 and onwards will not support the `inputs` parameter." - ), - DeprecationWarning, - ) - - if not _sources: - raise ValueError("The `sources` parameter must be specified.") def mainify(obj): # Needed to allow dill to properly serialize the udf. Otherwise, clients will need to have a file with the same @@ -651,15 +445,17 @@ def mainify(obj): obj.__module__ = "__main__" def decorator(user_function): + udf_string = dill.source.getsource(user_function) mainify(user_function) on_demand_feature_view_obj = OnDemandFeatureView( name=user_function.__name__, - sources=_sources, - schema=_schema, + sources=sources, + schema=schema, udf=user_function, description=description, tags=tags, owner=owner, + udf_string=udf_string, ) functools.update_wrapper( wrapper=on_demand_feature_view_obj, wrapped=user_function diff --git a/sdk/python/feast/proto_json.py b/sdk/python/feast/proto_json.py index 58b77edf8b..a0a4dce86b 100644 --- a/sdk/python/feast/proto_json.py +++ b/sdk/python/feast/proto_json.py @@ -1,12 +1,14 @@ import uuid from typing import Any, Callable, Type +import pkg_resources from google.protobuf.json_format import ( # type: ignore _WKTJSONMETHODS, ParseError, _Parser, _Printer, ) +from packaging import version from feast.protos.feast.serving.ServingService_pb2 import FeatureList from feast.protos.feast.types.Value_pb2 import RepeatedValue, Value @@ -15,8 +17,6 @@ JsonObject = Any -# TODO: These methods need to be updated when 
bumping the version of protobuf. -# https://github.com/feast-dev/feast/issues/2484 def _patch_proto_json_encoding( proto_type: Type[ProtoMessage], to_json_object: Callable[[_Printer, ProtoMessage], JsonObject], @@ -70,7 +70,7 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return value def from_json_object( - parser: _Parser, value: JsonObject, message: ProtoMessage, path: str + parser: _Parser, value: JsonObject, message: ProtoMessage ) -> None: if value is None: message.null_val = 0 @@ -111,7 +111,18 @@ def from_json_object( "Value {0} has unexpected type {1}.".format(value, type(value)) ) - _patch_proto_json_encoding(Value, to_json_object, from_json_object) + def from_json_object_updated( + parser: _Parser, value: JsonObject, message: ProtoMessage, path: str + ): + from_json_object(parser, value, message) + + # https://github.com/feast-dev/feast/issues/2484 Certain feast users need a higher version of protobuf but the + # parameters of `from_json_object` changes in feast 3.20.1. This change gives users flexibility to use earlier versions. 
+ current_version = pkg_resources.get_distribution("protobuf").version + if version.parse(current_version) < version.parse("3.20"): + _patch_proto_json_encoding(Value, to_json_object, from_json_object) + else: + _patch_proto_json_encoding(Value, to_json_object, from_json_object_updated) def _patch_feast_repeated_value_json_encoding(): @@ -141,14 +152,29 @@ def _patch_feast_repeated_value_json_encoding(): def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return [printer._MessageToJsonObject(item) for item in message.val] - def from_json_object( + def from_json_object_updated( parser: _Parser, value: JsonObject, message: ProtoMessage, path: str ) -> None: array = value if isinstance(value, list) else value["val"] for item in array: parser.ConvertMessage(item, message.val.add(), path) - _patch_proto_json_encoding(RepeatedValue, to_json_object, from_json_object) + def from_json_object( + parser: _Parser, value: JsonObject, message: ProtoMessage + ) -> None: + array = value if isinstance(value, list) else value["val"] + for item in array: + parser.ConvertMessage(item, message.val.add()) + + # https://github.com/feast-dev/feast/issues/2484 Certain feast users need a higher version of protobuf but the + # parameters of `from_json_object` changes in feast 3.20.1. This change gives users flexibility to use earlier versions. 
+ current_version = pkg_resources.get_distribution("protobuf").version + if version.parse(current_version) < version.parse("3.20"): + _patch_proto_json_encoding(RepeatedValue, to_json_object, from_json_object) + else: + _patch_proto_json_encoding( + RepeatedValue, to_json_object, from_json_object_updated + ) def _patch_feast_feature_list_json_encoding(): @@ -183,12 +209,25 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return list(message.val) def from_json_object( - parser: _Parser, value: JsonObject, message: ProtoMessage, path: str + parser: _Parser, value: JsonObject, message: ProtoMessage ) -> None: array = value if isinstance(value, list) else value["val"] message.val.extend(array) - _patch_proto_json_encoding(FeatureList, to_json_object, from_json_object) + def from_json_object_updated( + parser: _Parser, value: JsonObject, message: ProtoMessage, path: str + ) -> None: + from_json_object(parser, value, message) + + # https://github.com/feast-dev/feast/issues/2484 Certain feast users need a higher version of protobuf but the + # parameters of `from_json_object` changes in feast 3.20.1. This change gives users flexibility to use earlier versions. 
+ current_version = pkg_resources.get_distribution("protobuf").version + if version.parse(current_version) < version.parse("3.20"): + _patch_proto_json_encoding(FeatureList, to_json_object, from_json_object) + else: + _patch_proto_json_encoding( + FeatureList, to_json_object, from_json_object_updated + ) def patch(): diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 587907b284..47a5ae321d 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -17,10 +17,11 @@ from pydantic.error_wrappers import ErrorWrapper from pydantic.typing import Dict, Optional, Union -from feast import flags from feast.errors import ( FeastFeatureServerTypeInvalidError, FeastFeatureServerTypeSetError, + FeastOfflineStoreInvalidName, + FeastOnlineStoreInvalidName, FeastProviderNotSetError, ) from feast.importer import import_class @@ -34,8 +35,10 @@ # - existing values for the online store type in featurestore.yaml files continue to work in a backwards compatible way # - first party and third party implementations can use the same class loading code path. 
BATCH_ENGINE_CLASS_FOR_TYPE = { - "local": "feast.infra.materialization.LocalMaterializationEngine", - "lambda": "feast.infra.materialization.lambda.lambda_engine.LambdaMaterializationEngine", + "local": "feast.infra.materialization.local_engine.LocalMaterializationEngine", + "snowflake.engine": "feast.infra.materialization.snowflake_engine.SnowflakeMaterializationEngine", + "lambda": "feast.infra.materialization.aws_lambda.lambda_engine.LambdaMaterializationEngine", + "bytewax": "feast.infra.materialization.contrib.bytewax.bytewax_materialization_engine.BytewaxMaterializationEngine", } ONLINE_STORE_CLASS_FOR_TYPE = { @@ -46,6 +49,7 @@ "snowflake.online": "feast.infra.online_stores.snowflake.SnowflakeOnlineStore", "postgres": "feast.infra.online_stores.contrib.postgres.PostgreSQLOnlineStore", "hbase": "feast.infra.online_stores.contrib.hbase_online_store.hbase.HbaseOnlineStore", + "cassandra": "feast.infra.online_stores.contrib.cassandra_online_store.cassandra_online_store.CassandraOnlineStore", } OFFLINE_STORE_CLASS_FOR_TYPE = { @@ -56,6 +60,8 @@ "spark": "feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkOfflineStore", "trino": "feast.infra.offline_stores.contrib.trino_offline_store.trino.TrinoOfflineStore", "postgres": "feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStore", + "athena": "feast.infra.offline_stores.contrib.athena_offline_store.athena.AthenaOfflineStore", + "mssql": "feast.infra.offline_stores.contrib.mssql_offline_store.mssql.MsSqlServerOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { @@ -135,7 +141,7 @@ class RepoConfig(FeastBaseModel): """ FeatureServerConfig: Feature server configuration (optional depending on provider) """ flags: Any - """ Flags: Feature flags for experimental features (optional) """ + """ Flags (deprecated field): Feature flags for experimental features """ repo_path: Optional[Path] = None @@ -169,6 +175,8 @@ def __init__(self, **data: Any): self._offline_config = 
"bigquery" elif data["provider"] == "aws": self._offline_config = "redshift" + elif data["provider"] == "azure": + self._offline_config = "mssql" self._online_store = None if "online_store" in data: @@ -278,7 +286,8 @@ def _validate_online_store_config(cls, values): return values # Make sure that the provider configuration is set. We need it to set the defaults - assert "provider" in values + if "provider" not in values: + raise FeastProviderNotSetError() # Set the default type # This is only direct reference to a provider or online store that we should have @@ -315,7 +324,8 @@ def _validate_offline_store_config(cls, values): return values # Make sure that the provider configuration is set. We need it to set the defaults - assert "provider" in values + if "provider" not in values: + raise FeastProviderNotSetError() # Set the default type if "type" not in values["offline_store"]: @@ -325,6 +335,8 @@ def _validate_offline_store_config(cls, values): values["offline_store"]["type"] = "bigquery" elif values["provider"] == "aws": values["offline_store"]["type"] = "redshift" + if values["provider"] == "azure": + values["offline_store"]["type"] = "mssql" offline_store_type = values["offline_store"]["type"] @@ -391,15 +403,9 @@ def _validate_flags(cls, v): if not isinstance(v, Dict): return - for flag_name, val in v.items(): - if flag_name not in flags.FLAG_NAMES: - _logger.warn( - "Unrecognized flag: %s. This feature may be invalid, or may refer " - "to a previously experimental feature which has graduated to production.", - flag_name, - ) - if type(val) is not bool: - raise ValueError(f"Flag value, {val}, not valid.") + _logger.warning( + "Flags are no longer necessary in Feast. Experimental features will log warnings instead." 
+ ) return v @@ -455,8 +461,8 @@ def get_batch_engine_config_from_type(batch_engine_type: str): def get_online_config_from_type(online_store_type: str): if online_store_type in ONLINE_STORE_CLASS_FOR_TYPE: online_store_type = ONLINE_STORE_CLASS_FOR_TYPE[online_store_type] - else: - assert online_store_type.endswith("OnlineStore") + elif not online_store_type.endswith("OnlineStore"): + raise FeastOnlineStoreInvalidName(online_store_type) module_name, online_store_class_type = online_store_type.rsplit(".", 1) config_class_name = f"{online_store_class_type}Config" @@ -466,8 +472,8 @@ def get_online_config_from_type(online_store_type: str): def get_offline_config_from_type(offline_store_type: str): if offline_store_type in OFFLINE_STORE_CLASS_FOR_TYPE: offline_store_type = OFFLINE_STORE_CLASS_FOR_TYPE[offline_store_type] - else: - assert offline_store_type.endswith("OfflineStore") + elif not offline_store_type.endswith("OfflineStore"): + raise FeastOfflineStoreInvalidName(offline_store_type) module_name, offline_store_class_type = offline_store_type.rsplit(".", 1) config_class_name = f"{offline_store_class_type}Config" @@ -484,8 +490,8 @@ def get_feature_server_config_from_type(feature_server_type: str): return import_class(module_name, config_class_name, config_class_name) -def load_repo_config(repo_path: Path) -> RepoConfig: - config_path = repo_path / "feature_store.yaml" +def load_repo_config(repo_path: Path, fs_yaml_file: Path) -> RepoConfig: + config_path = fs_yaml_file with open(config_path) as f: raw_config = yaml.safe_load(os.path.expandvars(f.read())) diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index 9a5e64f8c3..e019ac7178 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -14,15 +14,16 @@ from feast import PushSource from feast.batch_feature_view import BatchFeatureView -from feast.data_source import DataSource, KafkaSource +from feast.data_source import DataSource, 
KafkaSource, KinesisSource from feast.diff.registry_diff import extract_objects_for_keep_delete_update_add from feast.entity import Entity from feast.feature_service import FeatureService from feast.feature_store import FeatureStore from feast.feature_view import DUMMY_ENTITY, FeatureView +from feast.file_utils import replace_str_in_file +from feast.infra.registry.registry import FEAST_OBJECT_TYPES, FeastObjectType, Registry from feast.names import adjectives, animals from feast.on_demand_feature_view import OnDemandFeatureView -from feast.registry import FEAST_OBJECT_TYPES, FeastObjectType, Registry from feast.repo_config import RepoConfig from feast.repo_contents import RepoContents from feast.request_feature_view import RequestFeatureView @@ -113,17 +114,30 @@ def parse_repo(repo_root: Path) -> RepoContents: request_feature_views=[], ) - data_sources_set = set() for repo_file in get_repo_files(repo_root): module_path = py_path_to_module(repo_file) module = importlib.import_module(module_path) + for attr_name in dir(module): obj = getattr(module, attr_name) + if isinstance(obj, DataSource) and not any( (obj is ds) for ds in res.data_sources ): res.data_sources.append(obj) - data_sources_set.add(obj) + + # Handle batch sources defined within stream sources. 
+ if ( + isinstance(obj, PushSource) + or isinstance(obj, KafkaSource) + or isinstance(obj, KinesisSource) + ): + batch_source = obj.batch_source + + if batch_source and not any( + (batch_source is ds) for ds in res.data_sources + ): + res.data_sources.append(batch_source) if ( isinstance(obj, FeatureView) and not any((obj is fv) for fv in res.feature_views) @@ -131,26 +145,42 @@ def parse_repo(repo_root: Path) -> RepoContents: and not isinstance(obj, BatchFeatureView) ): res.feature_views.append(obj) - if isinstance(obj.stream_source, PushSource) and not any( - (obj is ds) for ds in res.data_sources - ): - push_source_dep = obj.stream_source.batch_source - # Don't add if the push source's batch source is a duplicate of an existing batch source - if push_source_dep not in data_sources_set: - res.data_sources.append(push_source_dep) + + # Handle batch sources defined with feature views. + batch_source = obj.batch_source + assert batch_source + if not any((batch_source is ds) for ds in res.data_sources): + res.data_sources.append(batch_source) + + # Handle stream sources defined with feature views. + if obj.stream_source: + stream_source = obj.stream_source + if not any((stream_source is ds) for ds in res.data_sources): + res.data_sources.append(stream_source) elif isinstance(obj, StreamFeatureView) and not any( (obj is sfv) for sfv in res.stream_feature_views ): res.stream_feature_views.append(obj) - if ( - isinstance(obj.stream_source, PushSource) - or isinstance(obj.stream_source, KafkaSource) - and not any((obj is ds) for ds in res.data_sources) - ): - batch_source_dep = obj.stream_source.batch_source - # Don't add if the push source's batch source is a duplicate of an existing batch source - if batch_source_dep and batch_source_dep not in data_sources_set: - res.data_sources.append(batch_source_dep) + + # Handle batch sources defined with feature views. 
+ batch_source = obj.batch_source + if not any((batch_source is ds) for ds in res.data_sources): + res.data_sources.append(batch_source) + + # Handle stream sources defined with feature views. + stream_source = obj.stream_source + assert stream_source + if not any((stream_source is ds) for ds in res.data_sources): + res.data_sources.append(stream_source) + elif isinstance(obj, BatchFeatureView) and not any( + (obj is bfv) for bfv in res.feature_views + ): + res.feature_views.append(obj) + + # Handle batch sources defined with feature views. + batch_source = obj.batch_source + if not any((batch_source is ds) for ds in res.data_sources): + res.data_sources.append(batch_source) elif isinstance(obj, Entity) and not any( (obj is entity) for entity in res.entities ): @@ -167,6 +197,7 @@ def parse_repo(repo_root: Path) -> RepoContents: (obj is rfv) for rfv in res.request_feature_views ): res.request_feature_views.append(obj) + res.entities.append(DUMMY_ENTITY) return res @@ -299,7 +330,6 @@ def log_infra_changes( @log_exceptions_and_usage def apply_total(repo_config: RepoConfig, repo_path: Path, skip_source_validation: bool): - os.chdir(repo_path) project, registry, repo, store = _prepare_registry_and_repo(repo_config, repo_path) apply_total_with_repo_instance( @@ -324,13 +354,12 @@ def registry_dump(repo_config: RepoConfig, repo_path: Path) -> str: return json.dumps(registry_dict, indent=2, sort_keys=True) -def cli_check_repo(repo_path: Path): +def cli_check_repo(repo_path: Path, fs_yaml_file: Path): sys.path.append(str(repo_path)) - config_path = repo_path / "feature_store.yaml" - if not config_path.exists(): + if not fs_yaml_file.exists(): print( - f"Can't find feature_store.yaml at {repo_path}. Make sure you're running feast from an initialized " - f"feast repository. " + f"Can't find feature repo configuration file at {fs_yaml_file}. " + "Make sure you're running feast from an initialized feast repository." 
) sys.exit(1) @@ -382,7 +411,7 @@ def init_repo(repo_name: str, template: str): os.remove(bootstrap_path) # Template the feature_store.yaml file - feature_store_yaml_path = repo_path / "feature_store.yaml" + feature_store_yaml_path = repo_path / "feature_repo" / "feature_store.yaml" replace_str_in_file( feature_store_yaml_path, "project: my_project", f"project: {repo_name}" ) @@ -406,14 +435,6 @@ def is_valid_name(name: str) -> bool: return not name.startswith("_") and re.compile(r"\W+").search(name) is None -def replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with open(file_path, "wt") as f: - f.write(contents) - - def generate_project_name() -> str: """Generates a unique project name""" return f"{random.choice(adjectives)}_{random.choice(animals)}" diff --git a/sdk/python/feast/saved_dataset.py b/sdk/python/feast/saved_dataset.py index e2004d15f4..4a3043a873 100644 --- a/sdk/python/feast/saved_dataset.py +++ b/sdk/python/feast/saved_dataset.py @@ -8,6 +8,7 @@ from feast.data_source import DataSource from feast.dqm.profilers.profiler import Profile, Profiler +from feast.importer import import_class from feast.protos.feast.core.SavedDataset_pb2 import SavedDataset as SavedDatasetProto from feast.protos.feast.core.SavedDataset_pb2 import SavedDatasetMeta, SavedDatasetSpec from feast.protos.feast.core.SavedDataset_pb2 import ( @@ -31,6 +32,16 @@ def __new__(cls, name, bases, dct): return kls +_DATA_SOURCE_TO_SAVED_DATASET_STORAGE = { + "FileSource": "feast.infra.offline_stores.file_source.SavedDatasetFileStorage", +} + + +def get_saved_dataset_storage_class_from_path(saved_dataset_storage_path: str): + module_name, class_name = saved_dataset_storage_path.rsplit(".", 1) + return import_class(module_name, class_name, "SavedDatasetStorage") + + class SavedDatasetStorage(metaclass=_StorageRegistry): _proto_attr_name: str @@ -43,11 +54,24 @@ def 
from_proto(storage_proto: SavedDatasetStorageProto) -> "SavedDatasetStorage" @abstractmethod def to_proto(self) -> SavedDatasetStorageProto: - ... + pass @abstractmethod def to_data_source(self) -> DataSource: - ... + pass + + @staticmethod + def from_data_source(data_source: DataSource) -> "SavedDatasetStorage": + data_source_type = type(data_source).__name__ + if data_source_type in _DATA_SOURCE_TO_SAVED_DATASET_STORAGE: + cls = get_saved_dataset_storage_class_from_path( + _DATA_SOURCE_TO_SAVED_DATASET_STORAGE[data_source_type] + ) + return cls.from_data_source(data_source) + else: + raise ValueError( + f"This method currently does not support {data_source_type}." + ) class SavedDataset: diff --git a/sdk/python/feast/stream_feature_view.py b/sdk/python/feast/stream_feature_view.py index 29e8abb7da..176f38d093 100644 --- a/sdk/python/feast/stream_feature_view.py +++ b/sdk/python/feast/stream_feature_view.py @@ -8,9 +8,9 @@ import dill from typeguard import typechecked -from feast import utils +from feast import flags_helper, utils from feast.aggregation import Aggregation -from feast.data_source import DataSource, KafkaSource, PushSource +from feast.data_source import DataSource from feast.entity import Entity from feast.feature_view import FeatureView from feast.field import Field @@ -33,8 +33,8 @@ @typechecked class StreamFeatureView(FeatureView): """ - NOTE: Stream Feature Views are not yet fully implemented and exist to allow users to register their stream sources and - schemas with Feast. + A stream feature view defines a logical group of features that has both a stream data source and + a batch data source. Attributes: name: The unique name of the stream feature view. @@ -44,15 +44,14 @@ class StreamFeatureView(FeatureView): can result in extremely computationally intensive queries. schema: The schema of the feature view, including feature, timestamp, and entity columns. If not specified, can be inferred from the underlying data source. - source: DataSource. 
The stream source of data where this group of features is stored. + source: The stream source of data where this group of features is stored. aggregations: List of aggregations registered with the stream feature view. mode: The mode of execution. timestamp_field: Must be specified if aggregations are specified. Defines the timestamp column on which to aggregate windows. - online: Defines whether this stream feature view is used in online feature retrieval. + online: A boolean indicating whether online retrieval is enabled for this feature view. description: A human-readable description. tags: A dictionary of key-value pairs to store arbitrary metadata. - owner: The owner of the on demand feature view, typically the email of the primary - maintainer. + owner: The owner of the stream feature view, typically the email of the primary maintainer. udf: The user defined transformation function. This transformation function should have all of the corresponding imports imported within the function. """ @@ -76,28 +75,26 @@ class StreamFeatureView(FeatureView): def __init__( self, *, - name: Optional[str] = None, + name: str, + source: DataSource, entities: Optional[Union[List[Entity], List[str]]] = None, - ttl: Optional[timedelta] = None, + ttl: timedelta = timedelta(days=0), tags: Optional[Dict[str, str]] = None, online: Optional[bool] = True, description: Optional[str] = "", owner: Optional[str] = "", schema: Optional[List[Field]] = None, - source: Optional[DataSource] = None, aggregations: Optional[List[Aggregation]] = None, mode: Optional[str] = "spark", timestamp_field: Optional[str] = "", udf: Optional[FunctionType] = None, ): - warnings.warn( - "Stream Feature Views are experimental features in alpha development. 
" - "Some functionality may still be unstable so functionality can change in the future.", - RuntimeWarning, - ) - - if source is None: - raise ValueError("Stream Feature views need a source to be specified") + if not flags_helper.is_test(): + warnings.warn( + "Stream feature views are experimental features in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) if ( type(source).__name__ not in SUPPORTED_STREAM_SOURCES @@ -117,18 +114,11 @@ def __init__( self.mode = mode or "" self.timestamp_field = timestamp_field or "" self.udf = udf - _batch_source = None - if isinstance(source, KafkaSource) or isinstance(source, PushSource): - _batch_source = source.batch_source if source.batch_source else None - _ttl = ttl - if not _ttl: - _ttl = timedelta(days=0) + super().__init__( name=name, entities=entities, - ttl=_ttl, - batch_source=_batch_source, - stream_source=source, + ttl=ttl, tags=tags, online=online, description=description, diff --git a/sdk/python/feast/templates/athena/__init__.py b/sdk/python/feast/templates/athena/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/athena/feature_repo/__init__.py b/sdk/python/feast/templates/athena/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/athena/feature_repo/feature_store.yaml b/sdk/python/feast/templates/athena/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..13e7898e86 --- /dev/null +++ b/sdk/python/feast/templates/athena/feature_repo/feature_store.yaml @@ -0,0 +1,13 @@ +project: repo +registry: registry.db +provider: aws +online_store: + type: sqlite + path: online_store.db +offline_store: + type: athena + region: ap-northeast-2 + database: sampledb + data_source: AwsDataCatalog + s3_staging_location: s3://sagemaker-yelo-test +entity_key_serialization_version: 2 \ No newline at end of file diff --git 
a/sdk/python/feast/templates/athena/feature_repo/test_workflow.py b/sdk/python/feast/templates/athena/feature_repo/test_workflow.py new file mode 100644 index 0000000000..7d7daff865 --- /dev/null +++ b/sdk/python/feast/templates/athena/feature_repo/test_workflow.py @@ -0,0 +1,108 @@ +import os +from datetime import datetime, timedelta + +import pandas as pd + +from feast import Entity, Feature, FeatureStore, FeatureView, ValueType +from feast.infra.offline_stores.contrib.athena_offline_store.athena_source import ( + AthenaSource, +) + + +def test_end_to_end(): + + try: + fs = FeatureStore(".") + + driver_hourly_stats = AthenaSource( + timestamp_field="event_timestamp", + table="driver_stats", + # table="driver_stats_partitioned", + database="sampledb", + data_source="AwsDataCatalog", + created_timestamp_column="created", + # date_partition_column="std_date" + ) + + driver = Entity( + name="driver_id", + value_type=ValueType.INT64, + description="driver id", + ) + + driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=["driver_id"], + ttl=timedelta(days=365), + features=[ + Feature(name="conv_rate", dtype=ValueType.FLOAT), + Feature(name="acc_rate", dtype=ValueType.FLOAT), + Feature(name="avg_daily_trips", dtype=ValueType.INT64), + ], + online=True, + batch_source=driver_hourly_stats, + ) + + # apply repository + fs.apply([driver_hourly_stats, driver, driver_hourly_stats_view]) + + print(fs.list_data_sources()) + print(fs.list_feature_views()) + + entity_df = pd.DataFrame( + {"driver_id": [1001], "event_timestamp": [datetime.now()]} + ) + + # Read features from offline store + + feature_vector = ( + fs.get_historical_features( + features=["driver_hourly_stats:conv_rate"], entity_df=entity_df + ) + .to_df() + .to_dict() + ) + conv_rate = feature_vector["conv_rate"][0] + print(conv_rate) + assert conv_rate > 0 + + # load data into online store + fs.materialize_incremental(end_date=datetime.now()) + + online_response = 
fs.get_online_features( + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + ], + entity_rows=[{"driver_id": 1002}], + ) + online_response_dict = online_response.to_dict() + print(online_response_dict) + + except Exception as e: + print(e) + finally: + # tear down feature store + fs.teardown() + + +def test_cli(): + os.system("PYTHONPATH=$PYTHONPATH:/$(pwd) feast -c feature_repo apply") + try: + os.system("PYTHONPATH=$PYTHONPATH:/$(pwd) ") + with open("output", "r") as f: + output = f.read() + + if "Pulling latest features from my offline store" not in output: + raise Exception( + 'Failed to successfully use provider from CLI. See "output" for more details.' + ) + finally: + os.system("PYTHONPATH=$PYTHONPATH:/$(pwd) feast -c feature_repo teardown") + + +if __name__ == "__main__": + # pass + test_end_to_end() + test_cli() diff --git a/sdk/python/feast/templates/aws/README.md b/sdk/python/feast/templates/aws/README.md new file mode 100644 index 0000000000..008a338e98 --- /dev/null +++ b/sdk/python/feast/templates/aws/README.md @@ -0,0 +1,19 @@ +# Feast Quickstart +A quick view of what's in this repository: + +* `data/` contains raw demo parquet data +* `example_repo.py` contains demo feature definitions +* `feature_store.yaml` contains a demo setup configuring where data sources are +* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features. + +You can run the overall workflow with `python test_workflow.py`. + +## To move from this into a more production ready workflow: +1. `feature_store.yaml` points to a local file as a registry. You'll want to setup a remote file (e.g. in S3/GCS) or a +SQL registry. See [registry docs](https://docs.feast.dev/getting-started/concepts/registry) for more details. +2. Setup CI/CD + dev vs staging vs prod environments to automatically update the registry as you change Feast feature definitions. 
See [docs](https://docs.feast.dev/how-to-guides/running-feast-in-production#1.-automatically-deploying-changes-to-your-feature-definitions). +3. (optional) Regularly scheduled materialization to power low latency feature retrieval (e.g. via Airflow). See [Batch data ingestion](https://docs.feast.dev/getting-started/concepts/data-ingestion#batch-data-ingestion) +for more details. +4. (optional) Deploy feature server instances with `feast serve` to expose endpoints to retrieve online features. + - See [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) for details. + - Use cases can also directly call the Feast client to fetch features as per [Feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) \ No newline at end of file diff --git a/sdk/python/feast/templates/aws/bootstrap.py b/sdk/python/feast/templates/aws/bootstrap.py index 456c6e9b70..dcabadd358 100644 --- a/sdk/python/feast/templates/aws/bootstrap.py +++ b/sdk/python/feast/templates/aws/bootstrap.py @@ -1,5 +1,6 @@ import click +from feast.file_utils import replace_str_in_file from feast.infra.utils import aws_utils @@ -51,14 +52,14 @@ def bootstrap(): driver_df, ) - repo_path = pathlib.Path(__file__).parent.absolute() - config_file = repo_path / "feature_store.yaml" - driver_file = repo_path / "driver_repo.py" + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" + example_py_file = repo_path / "example_repo.py" + replace_str_in_file(example_py_file, "%REDSHIFT_DATABASE%", database) + config_file = repo_path / "feature_store.yaml" replace_str_in_file(config_file, "%AWS_REGION%", aws_region) replace_str_in_file(config_file, "%REDSHIFT_CLUSTER_ID%", cluster_id) replace_str_in_file(config_file, "%REDSHIFT_DATABASE%", database) - replace_str_in_file(driver_file, "%REDSHIFT_DATABASE%", database) replace_str_in_file(config_file, "%REDSHIFT_USER%", user) replace_str_in_file( config_file, "%REDSHIFT_S3_STAGING_LOCATION%", 
s3_staging_location @@ -66,13 +67,5 @@ def bootstrap(): replace_str_in_file(config_file, "%REDSHIFT_IAM_ROLE%", iam_role) -def replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with open(file_path, "wt") as f: - f.write(contents) - - if __name__ == "__main__": bootstrap() diff --git a/sdk/python/feast/templates/aws/driver_repo.py b/sdk/python/feast/templates/aws/driver_repo.py deleted file mode 100644 index f80f16bb6f..0000000000 --- a/sdk/python/feast/templates/aws/driver_repo.py +++ /dev/null @@ -1,67 +0,0 @@ -from datetime import timedelta - -from feast import Entity, FeatureService, FeatureView, Field, RedshiftSource -from feast.types import Float32, Int64 - -# Define an entity for the driver. Entities can be thought of as primary keys used to -# retrieve features. Entities are also used to join multiple tables/views during the -# construction of feature vectors -driver = Entity( - # Name of the entity. Must be unique within a project - name="driver", - # The join keys of an entity describe the storage level field/column on which - # features can be looked up. The join keys are also used to join feature - # tables/views when building feature vectors - join_keys=["driver_id"], -) - -# Indicates a data source from which feature values can be retrieved. Sources are queried when building training -# datasets or materializing features into an online store. -driver_stats_source = RedshiftSource( - # The Redshift table where features can be found - table="feast_driver_hourly_stats", - # The event timestamp is used for point-in-time joins and for ensuring only - # features within the TTL are returned - timestamp_field="event_timestamp", - # The (optional) created timestamp is used to ensure there are no duplicate - # feature rows in the offline store or when building training datasets - created_timestamp_column="created", - # Database to redshift source. 
- database="%REDSHIFT_DATABASE%", -) - -# Feature views are a grouping based on how features are stored in either the -# online or offline store. -driver_stats_fv = FeatureView( - # The unique name of this feature view. Two feature views in a single - # project cannot have the same name - name="driver_hourly_stats", - # The list of entities specifies the keys required for joining or looking - # up features from this feature view. The reference provided in this field - # correspond to the name of a defined entity (or entities) - entities=[driver], - # The timedelta is the maximum age that each feature value may have - # relative to its lookup time. For historical features (used in training), - # TTL is relative to each timestamp provided in the entity dataframe. - # TTL also allows for eviction of keys from online stores and limits the - # amount of historical scanning required for historical feature values - # during retrieval - ttl=timedelta(weeks=52), - # The list of features defined below act as a schema to both define features - # for both materialization of features into a store, and are used as references - # during retrieval for building a training dataset or serving features - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - # Batch sources are used to find feature values. 
In the case of this feature - # view we will query a source table on Redshift for driver statistics - # features - source=driver_stats_source, - # Tags are user defined key/value pairs that are attached to each - # feature view - tags={"team": "driver_performance"}, -) - -driver_stats_fs = FeatureService(name="driver_activity", features=[driver_stats_fv]) diff --git a/sdk/python/feast/templates/aws/feature_repo/__init__.py b/sdk/python/feast/templates/aws/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/aws/feature_repo/example_repo.py b/sdk/python/feast/templates/aws/feature_repo/example_repo.py new file mode 100644 index 0000000000..eaa1a1bfd4 --- /dev/null +++ b/sdk/python/feast/templates/aws/feature_repo/example_repo.py @@ -0,0 +1,105 @@ +# This is an example feature definition file + +from datetime import timedelta + +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + PushSource, + RedshiftSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Defines a data source from which feature values can be retrieved. Sources are queried when building training +# datasets or materializing features into an online store. 
+driver_stats_source = RedshiftSource( + # The Redshift table where features can be found + table="feast_driver_hourly_stats", + # The event timestamp is used for point-in-time joins and for ensuring only + # features within the TTL are returned + timestamp_field="event_timestamp", + # The (optional) created timestamp is used to ensure there are no duplicate + # feature rows in the offline store or when building training datasets + created_timestamp_column="created", + # Database to redshift source. + database="%REDSHIFT_DATABASE%", +) + +# Our parquet files contain sample data that includes a driver_id column, timestamps and +# three feature column. Here we define a Feature View that will allow us to serve this +# data to our model online. +driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + online=True, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/aws/feature_repo/feature_store.yaml b/sdk/python/feast/templates/aws/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..c29496711b --- /dev/null +++ b/sdk/python/feast/templates/aws/feature_repo/feature_store.yaml @@ -0,0 +1,28 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +# On AWS, minimally you should create a S3 bucket for a remote file registry +# See https://docs.feast.dev/getting-started/concepts/registry for details +registry: data/registry.db +# The provider primarily specifies default offline / online stores & storing the registry in a given cloud +provider: aws +# Note: if you comment the online store out, then this by default will use DynamoDB as the online store 
+online_store: + type: sqlite + path: data/online_store.db +# See https://docs.feast.dev/reference/online-stores/dynamodb +#online_store: +# type: dynamodb +# region: %AWS_REGION% +# See https://docs.feast.dev/reference/online-stores/redis +#online_store: +# type: redis +# connection_string: "localhost:6379" +offline_store: + type: redshift + cluster_id: %REDSHIFT_CLUSTER_ID% + region: %AWS_REGION% + database: %REDSHIFT_DATABASE% + user: %REDSHIFT_USER% + s3_staging_location: %REDSHIFT_S3_STAGING_LOCATION% + iam_role: %REDSHIFT_IAM_ROLE% +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/aws/feature_repo/test_workflow.py b/sdk/python/feast/templates/aws/feature_repo/test_workflow.py new file mode 100644 index 0000000000..0d5b2714d9 --- /dev/null +++ b/sdk/python/feast/templates/aws/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply to setup feature store on AWS ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + 
"created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. 
Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/aws/feature_store.yaml b/sdk/python/feast/templates/aws/feature_store.yaml deleted file mode 100644 index 3745a75347..0000000000 --- a/sdk/python/feast/templates/aws/feature_store.yaml +++ /dev/null @@ -1,15 +0,0 @@ -project: my_project -registry: data/registry.db -provider: aws -online_store: - type: dynamodb - region: %AWS_REGION% -offline_store: - type: redshift - cluster_id: %REDSHIFT_CLUSTER_ID% - region: %AWS_REGION% - database: %REDSHIFT_DATABASE% - 
user: %REDSHIFT_USER% - s3_staging_location: %REDSHIFT_S3_STAGING_LOCATION% - iam_role: %REDSHIFT_IAM_ROLE% -entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/aws/test.py b/sdk/python/feast/templates/aws/test.py deleted file mode 100644 index 3d223e8f26..0000000000 --- a/sdk/python/feast/templates/aws/test.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import datetime, timedelta - -import pandas as pd -from driver_repo import driver, driver_stats_fv - -from feast import FeatureStore - - -def main(): - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 1000) - - # Load the feature store from the current path - fs = FeatureStore(repo_path=".") - - # Deploy the feature store to AWS - print("Deploying feature store to AWS...") - fs.apply([driver, driver_stats_fv]) - - # Select features - features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] - - # Create an entity dataframe. This is the dataframe that will be enriched with historical features - entity_df = pd.DataFrame( - { - "event_timestamp": [ - pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") - for dt in pd.date_range( - start=datetime.now() - timedelta(days=3), - end=datetime.now(), - periods=3, - ) - ], - "driver_id": [1001, 1002, 1003], - } - ) - - print("Retrieving training data...") - - # Retrieve historical features by joining the entity dataframe to the Redshift table source - training_df = fs.get_historical_features( - features=features, entity_df=entity_df - ).to_df() - - print() - print(training_df) - - print() - print("Loading features into the online store...") - fs.materialize_incremental(end_date=datetime.now()) - - print() - print("Retrieving online features...") - - # Retrieve features from the online store (Firestore) - online_features = fs.get_online_features( - features=features, - entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], - ).to_dict() - - print() - print(pd.DataFrame.from_dict(online_features)) - - -if 
__name__ == "__main__": - main() diff --git a/sdk/python/feast/templates/cassandra/__init__.py b/sdk/python/feast/templates/cassandra/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/cassandra/bootstrap.py b/sdk/python/feast/templates/cassandra/bootstrap.py new file mode 100644 index 0000000000..464eba271f --- /dev/null +++ b/sdk/python/feast/templates/cassandra/bootstrap.py @@ -0,0 +1,257 @@ +# +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pathlib +import sys +from datetime import datetime, timedelta + +import click + +from feast.file_utils import ( + remove_lines_from_file, + replace_str_in_file, + write_setting_or_remove, +) + + +def collect_cassandra_store_settings(): + """ + Interactive CLI collection of settings for the feature store yaml. + Returns a dict with all keys, possibly some are None. 
+ """ + + db_type = click.prompt( + "Regular [C]assandra or [A]stra DB?", + type=click.Choice(["C", "A"]), + show_choices=False, + default="C", + ) + is_astra = db_type == "A" + + if is_astra: + c_secure_bundle_path = click.prompt( + "Enter the full path to your Secure Connect Bundle" + ) + c_hosts = None + c_port = None + c_username = click.prompt("Enter the Client ID from your Astra DB token") + c_password = click.prompt( + "Enter the Client Secret from your Astra DB token", + hide_input=True, + ) + else: + # it's regular Cassandra + c_secure_bundle_path = None + hosts_string = click.prompt( + ("Enter the seed hosts of your cluster " "(comma-separated IP addresses)"), + default="127.0.0.1", + ) + c_hosts = [ + haddr + for haddr in (host.strip() for host in hosts_string.split(",")) + if haddr != "" + ] + if not c_hosts: + print("*Error* : seed host list cannot be empty.") + sys.exit(1) + needs_port = click.confirm("Need to specify port?", default=False) + if needs_port: + c_port = click.prompt("Port to use", default=9042, type=int) + else: + c_port = None + use_auth = click.confirm( + "Do you need username/password?", + default=False, + ) + if use_auth: + c_username = click.prompt("Database username") + c_password = click.prompt("Database password", hide_input=True) + else: + c_username = None + c_password = None + + c_keyspace = click.prompt( + "Specify the keyspace to use", + default="feast_keyspace", + ) + + specify_protocol_version = click.confirm( + "Specify protocol version?", + default=False, + ) + if specify_protocol_version: + c_protocol_version = click.prompt( + "Protocol version", + default={"A": 4, "C": 5}.get(db_type, 5), + type=int, + ) + else: + c_protocol_version = None + + specify_lb = click.confirm("Specify load-balancing?", default=False) + if specify_lb: + c_local_dc = click.prompt( + "Local datacenter (for load-balancing)", + default="datacenter1" if db_type == "C" else None, + ) + c_load_balancing_policy = click.prompt( + "Load-balancing 
policy", + type=click.Choice( + [ + "TokenAwarePolicy(DCAwareRoundRobinPolicy)", + "DCAwareRoundRobinPolicy", + ] + ), + default="TokenAwarePolicy(DCAwareRoundRobinPolicy)", + ) + else: + c_local_dc = None + c_load_balancing_policy = None + + return { + "c_secure_bundle_path": c_secure_bundle_path, + "c_hosts": c_hosts, + "c_port": c_port, + "c_username": c_username, + "c_password": c_password, + "c_keyspace": c_keyspace, + "c_protocol_version": c_protocol_version, + "c_local_dc": c_local_dc, + "c_load_balancing_policy": c_load_balancing_policy, + } + + +def apply_cassandra_store_settings(config_file, settings): + """ + In-place replacements to `config_file` according to the settings + to make the yaml a proper Cassandra/AstraDB feature-store yaml. + `settings` must have all its keys, possibly the optional ones set to None: + 'c_secure_bundle_path' + 'c_hosts' + 'c_port' + 'c_username' + 'c_password' + 'c_keyspace' + 'c_protocol_version' + 'c_local_dc' + 'c_load_balancing_policy' + """ + write_setting_or_remove( + config_file, + settings["c_secure_bundle_path"], + "secure_bundle_path", + "/path/to/secure/bundle.zip", + ) + # + if settings["c_hosts"]: + replace_str_in_file( + config_file, + " - 127.0.0.1", + os.linesep.join(f" - {c_host}" for c_host in settings["c_hosts"]), + ) + else: + remove_lines_from_file(config_file, "hosts:") + remove_lines_from_file(config_file, "- 127.0.0.1") + # + write_setting_or_remove( + config_file, + settings["c_port"], + "port", + "9042", + ) + # + write_setting_or_remove( + config_file, + settings["c_username"], + "username", + "c_username", + ) + # + write_setting_or_remove( + config_file, + settings["c_password"], + "password", + "c_password", + ) + # + replace_str_in_file( + config_file, + "feast_keyspace", + settings["c_keyspace"], + ) + # + write_setting_or_remove( + config_file, + settings["c_protocol_version"], + "protocol_version", + "c_protocol_version", + ) + # it is assumed that if there's local_dc also there's l.b.p. 
+ if settings["c_local_dc"] is not None: + replace_str_in_file( + config_file, + "c_local_dc", + settings["c_local_dc"], + ) + replace_str_in_file( + config_file, + "c_load_balancing_policy", + settings["c_load_balancing_policy"], + ) + else: + remove_lines_from_file(config_file, "load_balancing:") + remove_lines_from_file(config_file, "local_dc:") + remove_lines_from_file(config_file, "load_balancing_policy:") + + +def bootstrap(): + """ + Bootstrap() will automatically be called + from the init_repo() during `feast init`. + """ + from feast.driver_test_data import create_driver_hourly_stats_df + + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" + config_file = repo_path / "feature_store.yaml" + + data_path = repo_path / "data" + data_path.mkdir(exist_ok=True) + + end_date = datetime.now().replace(microsecond=0, second=0, minute=0) + start_date = end_date - timedelta(days=15) + # + driver_entities = [1001, 1002, 1003, 1004, 1005] + driver_df = create_driver_hourly_stats_df( + driver_entities, + start_date, + end_date, + ) + # + driver_stats_path = data_path / "driver_stats.parquet" + driver_df.to_parquet(path=str(driver_stats_path), allow_truncated_timestamps=True) + + # example_repo.py + example_py_file = repo_path / "example_repo.py" + replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) + + # store config yaml, interact with user and then customize file: + settings = collect_cassandra_store_settings() + apply_cassandra_store_settings(config_file, settings) + + +if __name__ == "__main__": + bootstrap() diff --git a/sdk/python/feast/templates/cassandra/feature_repo/__init__.py b/sdk/python/feast/templates/cassandra/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/cassandra/feature_repo/example_repo.py b/sdk/python/feast/templates/cassandra/feature_repo/example_repo.py new file mode 100644 index 0000000000..b3c7115482 --- /dev/null +++ 
b/sdk/python/feast/templates/cassandra/feature_repo/example_repo.py @@ -0,0 +1,100 @@ +# This is an example feature definition file + +from datetime import timedelta + +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + FileSource, + PushSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Read data from parquet files. Parquet is convenient for local development mode. For +# production, you can use your favorite DWH, such as BigQuery. See Feast documentation +# for more info. +driver_stats_source = FileSource( + name="driver_hourly_stats_source", + path="%PARQUET_PATH%", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Our parquet files contain sample data that includes a driver_id column, timestamps and +# three feature columns. Here we define a Feature View that will allow us to serve this +# data to our model online. +driver_stats_fv = FeatureView( + # The unique name of this feature view. 
Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + online=True, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/cassandra/feature_repo/feature_store.yaml b/sdk/python/feast/templates/cassandra/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..076a0d7c6b --- /dev/null +++ b/sdk/python/feast/templates/cassandra/feature_repo/feature_store.yaml @@ -0,0 +1,19 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +registry: data/registry.db +# The provider primarily specifies default offline / online stores & storing the registry in a given cloud +provider: local +online_store: + type: cassandra + secure_bundle_path: /path/to/secure/bundle.zip + hosts: + - 127.0.0.1 + port: 9042 + username: c_username + password: c_password + keyspace: feast_keyspace + protocol_version: c_protocol_version + 
load_balancing: + local_dc: c_local_dc + load_balancing_policy: c_load_balancing_policy +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/cassandra/feature_repo/test_workflow.py b/sdk/python/feast/templates/cassandra/feature_repo/test_workflow.py new file mode 100644 index 0000000000..2c388deea9 --- /dev/null +++ b/sdk/python/feast/templates/cassandra/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def 
fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in 
sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/gcp/README.md b/sdk/python/feast/templates/gcp/README.md new file mode 100644 index 0000000000..7929dc2bdf --- /dev/null +++ b/sdk/python/feast/templates/gcp/README.md @@ -0,0 +1,21 @@ +# Feast Quickstart +A quick view of what's in this repository: + +* `data/` contains raw demo parquet data +* `example_repo.py` contains demo feature definitions +* `feature_store.yaml` contains a demo setup configuring where data sources are +* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features. + +You can run the overall workflow with `python test_workflow.py`. + +## To move from this into a more production-ready workflow: +1. `feature_store.yaml` points to a local file as a registry. You'll want to set up a remote file (e.g. in S3/GCS) or a +SQL registry. See [registry docs](https://docs.feast.dev/getting-started/concepts/registry) for more details. +2. This example uses an already set up BigQuery Feast data warehouse as the [offline store](https://docs.feast.dev/getting-started/architecture-and-components/offline-store) + to generate training data. You'll need to connect your own BigQuery instance to make this work. +3. Set up CI/CD + dev vs staging vs prod environments to automatically update the registry as you change Feast feature definitions. See [docs](https://docs.feast.dev/how-to-guides/running-feast-in-production#1.-automatically-deploying-changes-to-your-feature-definitions). +4. (optional) Regularly scheduled materialization to power low latency feature retrieval (e.g. via Airflow). See [Batch data ingestion](https://docs.feast.dev/getting-started/concepts/data-ingestion#batch-data-ingestion) +for more details. +5. (optional) Deploy feature server instances with `feast serve` to expose endpoints to retrieve online features. 
+ - See [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) for details. + - Use cases can also directly call the Feast client to fetch features as per [Feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) \ No newline at end of file diff --git a/sdk/python/feast/templates/gcp/driver_repo.py b/sdk/python/feast/templates/gcp/driver_repo.py deleted file mode 100644 index 6c904a0fee..0000000000 --- a/sdk/python/feast/templates/gcp/driver_repo.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import timedelta - -from feast import BigQuerySource, Entity, FeatureService, FeatureView, Field -from feast.types import Float32, Int64 - -# Define an entity for the driver. Entities can be thought of as primary keys used to -# retrieve features. Entities are also used to join multiple tables/views during the -# construction of feature vectors -driver = Entity( - # Name of the entity. Must be unique within a project - name="driver", - # The join keys of an entity describe the storage level field/column on which - # features can be looked up. The join keys are also used to join feature - # tables/views when building feature vectors - join_keys=["driver_id"], -) - -# Indicates a data source from which feature values can be retrieved. Sources are queried when building training -# datasets or materializing features into an online store. 
-driver_stats_source = BigQuerySource( - name="driver_hourly_stats_source", - # The BigQuery table where features can be found - table="feast-oss.demo_data.driver_hourly_stats_2", - # The event timestamp is used for point-in-time joins and for ensuring only - # features within the TTL are returned - timestamp_field="event_timestamp", - # The (optional) created timestamp is used to ensure there are no duplicate - # feature rows in the offline store or when building training datasets - created_timestamp_column="created", -) - -# Feature views are a grouping based on how features are stored in either the -# online or offline store. -driver_stats_fv = FeatureView( - # The unique name of this feature view. Two feature views in a single - # project cannot have the same name - name="driver_hourly_stats", - # The list of entities specifies the keys required for joining or looking - # up features from this feature view. The reference provided in this field - # correspond to the name of a defined entity (or entities) - entities=[driver], - # The timedelta is the maximum age that each feature value may have - # relative to its lookup time. For historical features (used in training), - # TTL is relative to each timestamp provided in the entity dataframe. - # TTL also allows for eviction of keys from online stores and limits the - # amount of historical scanning required for historical feature values - # during retrieval - ttl=timedelta(weeks=52), - # The list of features defined below act as a schema to both define features - # for both materialization of features into a store, and are used as references - # during retrieval for building a training dataset or serving features - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - # Batch sources are used to find feature values. 
In the case of this feature - # view we will query a source table on BigQuery for driver statistics - # features - source=driver_stats_source, - # Tags are user defined key/value pairs that are attached to each - # feature view - tags={"team": "driver_performance"}, -) - -driver_stats_fs = FeatureService(name="driver_activity", features=[driver_stats_fv]) diff --git a/sdk/python/feast/templates/gcp/feature_repo/__init__.py b/sdk/python/feast/templates/gcp/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/gcp/feature_repo/example_repo.py b/sdk/python/feast/templates/gcp/feature_repo/example_repo.py new file mode 100644 index 0000000000..ab2f696ef2 --- /dev/null +++ b/sdk/python/feast/templates/gcp/feature_repo/example_repo.py @@ -0,0 +1,109 @@ +from datetime import timedelta + +import pandas as pd + +from feast import ( + BigQuerySource, + Entity, + FeatureService, + FeatureView, + Field, + PushSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Defines a data source from which feature values can be retrieved. Sources are queried when building training +# datasets or materializing features into an online store. 
+driver_stats_source = BigQuerySource( + name="driver_hourly_stats_source", + # The BigQuery table where features can be found + table="feast-oss.demo_data.driver_hourly_stats_2", + # The event timestamp is used for point-in-time joins and for ensuring only + # features within the TTL are returned + timestamp_field="event_timestamp", + # The (optional) created timestamp is used to ensure there are no duplicate + # feature rows in the offline store or when building training datasets + created_timestamp_column="created", +) + +# Feature views are a grouping based on how features are stored in either the +# online or offline store. +driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + # The list of entities specifies the keys required for joining or looking + # up features from this feature view. The reference provided in this field + # corresponds to the name of a defined entity (or entities) + entities=[driver], + # The timedelta is the maximum age that each feature value may have + # relative to its lookup time. For historical features (used in training), + # TTL is relative to each timestamp provided in the entity dataframe. 
+ # TTL also allows for eviction of keys from online stores and limits the + # amount of historical scanning required for historical feature values + # during retrieval + ttl=timedelta(weeks=52 * 10), # Set to be very long for example purposes only + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/gcp/feature_repo/feature_store.yaml b/sdk/python/feast/templates/gcp/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..7d4096615a --- /dev/null +++ b/sdk/python/feast/templates/gcp/feature_repo/feature_store.yaml @@ -0,0 +1,20 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +# On GCP, minimally you should create a GCS bucket for a remote file registry +# See https://docs.feast.dev/getting-started/concepts/registry for details +registry: data/registry.db +provider: gcp +# Note: if you comment the online store out, then this by default will use Datastore as the online store +online_store: + type: sqlite + path: data/online_store.db +# See 
https://docs.feast.dev/reference/online-stores/datastore +#online_store: +# type: datastore +# project_id: my_gcp_project +# namespace: my_datastore_namespace +# See https://docs.feast.dev/reference/online-stores/redis +#online_store: +# type: redis +# connection_string: "localhost:6379" +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/gcp/feature_repo/test_workflow.py b/sdk/python/feast/templates/gcp/feature_repo/test_workflow.py new file mode 100644 index 0000000000..0f8d889477 --- /dev/null +++ b/sdk/python/feast/templates/gcp/feature_repo/test_workflow.py @@ -0,0 +1,126 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply to setup feature store on GCP ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + # You can normally push to offline too, but in this case, you don't have access (since it's a Feast owned 
+ # BigQuery source) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + 
"driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/gcp/feature_store.yaml b/sdk/python/feast/templates/gcp/feature_store.yaml deleted file mode 100644 index 74ee729090..0000000000 --- a/sdk/python/feast/templates/gcp/feature_store.yaml +++ /dev/null @@ -1,4 +0,0 @@ -project: my_project -registry: data/registry.db -provider: gcp -entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/gcp/test.py b/sdk/python/feast/templates/gcp/test.py deleted file mode 100644 index 8ff11bda5c..0000000000 --- a/sdk/python/feast/templates/gcp/test.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import datetime, timedelta - -import pandas as pd -from driver_repo import driver, driver_stats_fv - -from feast import FeatureStore - - -def main(): - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 1000) - - # Load the feature store from the current path - fs = FeatureStore(repo_path=".") - - # Deploy the feature store to GCP - print("Deploying feature store to GCP...") - fs.apply([driver, driver_stats_fv]) - - # Select features - features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] - - # Create an entity dataframe. 
This is the dataframe that will be enriched with historical features - entity_df = pd.DataFrame( - { - "event_timestamp": [ - pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") - for dt in pd.date_range( - start=datetime.now() - timedelta(days=3), - end=datetime.now(), - periods=3, - ) - ], - "driver_id": [1001, 1002, 1003], - } - ) - - print("Retrieving training data...") - - # Retrieve historical features by joining the entity dataframe to the BigQuery table source - training_df = fs.get_historical_features( - features=features, entity_df=entity_df - ).to_df() - - print() - print(training_df) - - print() - print("Loading features into the online store...") - fs.materialize_incremental(end_date=datetime.now()) - - print() - print("Retrieving online features...") - - # Retrieve features from the online store (Firestore) - online_features = fs.get_online_features( - features=features, - entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], - ).to_dict() - - print() - print(pd.DataFrame.from_dict(online_features)) - - -if __name__ == "__main__": - main() diff --git a/sdk/python/feast/templates/hbase/bootstrap.py b/sdk/python/feast/templates/hbase/bootstrap.py index 4013ca5a8d..125eb7c2e7 100644 --- a/sdk/python/feast/templates/hbase/bootstrap.py +++ b/sdk/python/feast/templates/hbase/bootstrap.py @@ -1,3 +1,6 @@ +from feast.file_utils import replace_str_in_file + + def bootstrap(): # Bootstrap() will automatically be called from the init_repo() during `feast init` @@ -6,7 +9,7 @@ def bootstrap(): from feast.driver_test_data import create_driver_hourly_stats_df - repo_path = pathlib.Path(__file__).parent.absolute() + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" data_path = repo_path / "data" data_path.mkdir(exist_ok=True) @@ -19,17 +22,9 @@ def bootstrap(): driver_stats_path = data_path / "driver_stats.parquet" driver_df.to_parquet(path=str(driver_stats_path), allow_truncated_timestamps=True) - example_py_file = repo_path / "example.py" + 
example_py_file = repo_path / "example_repo.py" replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) -def replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with open(file_path, "wt") as f: - f.write(contents) - - if __name__ == "__main__": bootstrap() diff --git a/sdk/python/feast/templates/hbase/example.py b/sdk/python/feast/templates/hbase/example.py deleted file mode 100644 index 6845371f1f..0000000000 --- a/sdk/python/feast/templates/hbase/example.py +++ /dev/null @@ -1,37 +0,0 @@ -# This is an example feature definition file - -from datetime import timedelta - -from feast import Entity, FeatureView, Field, FileSource -from feast.types import Float32, Int64 - -# Read data from parquet files. Parquet is convenient for local development mode. For -# production, you can use your favorite DWH, such as BigQuery. See Feast documentation -# for more info. -driver_hourly_stats = FileSource( - name="driver_hourly_stats_source", - path="%PARQUET_PATH%", - timestamp_field="event_timestamp", - created_timestamp_column="created", -) - -# Define an entity for the driver. You can think of entity as a primary key used to -# fetch features. -driver = Entity(name="driver", join_keys=["driver_id"]) - -# Our parquet files contain sample data that includes a driver_id column, timestamps and -# three feature column. Here we define a Feature View that will allow us to serve this -# data to our model online. 
-driver_hourly_stats_view = FeatureView( - name="driver_hourly_stats", - entities=["driver"], - ttl=timedelta(days=1), - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - online=True, - source=driver_hourly_stats, - tags={}, -) diff --git a/sdk/python/feast/templates/hbase/feature_repo/__init__.py b/sdk/python/feast/templates/hbase/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/hbase/feature_repo/example_repo.py b/sdk/python/feast/templates/hbase/feature_repo/example_repo.py new file mode 100644 index 0000000000..b3c7115482 --- /dev/null +++ b/sdk/python/feast/templates/hbase/feature_repo/example_repo.py @@ -0,0 +1,100 @@ +# This is an example feature definition file + +from datetime import timedelta + +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + FileSource, + PushSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Read data from parquet files. Parquet is convenient for local development mode. For +# production, you can use your favorite DWH, such as BigQuery. See Feast documentation +# for more info. +driver_stats_source = FileSource( + name="driver_hourly_stats_source", + path="%PARQUET_PATH%", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Our parquet files contain sample data that includes a driver_id column, timestamps and +# three feature column. Here we define a Feature View that will allow us to serve this +# data to our model online. +driver_stats_fv = FeatureView( + # The unique name of this feature view. 
Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + online=True, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/hbase/feature_repo/feature_store.yaml b/sdk/python/feast/templates/hbase/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..bde49486ad --- /dev/null +++ b/sdk/python/feast/templates/hbase/feature_repo/feature_store.yaml @@ -0,0 +1,10 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +registry: data/registry.db +# The provider primarily specifies default offline / online stores & storing the registry in a given cloud +provider: local +online_store: + type: hbase + host: 127.0.0.1 + port: 9090 +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/hbase/feature_repo/test_workflow.py b/sdk/python/feast/templates/hbase/feature_repo/test_workflow.py new file mode 
100644 index 0000000000..76b8d7836c --- /dev/null +++ b/sdk/python/feast/templates/hbase/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # 
entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/hbase/feature_store.yaml b/sdk/python/feast/templates/hbase/feature_store.yaml deleted file mode 100644 index f99e858f7c..0000000000 --- 
a/sdk/python/feast/templates/hbase/feature_store.yaml +++ /dev/null @@ -1,8 +0,0 @@ -project: my_project -registry: data/registry.db -provider: local -online_store: - type: hbase - host: 127.0.0.1 - port: 9090 -entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/local/README.md b/sdk/python/feast/templates/local/README.md new file mode 100644 index 0000000000..8133b6e84e --- /dev/null +++ b/sdk/python/feast/templates/local/README.md @@ -0,0 +1,27 @@ +# Feast Quickstart +If you haven't already, check out the quickstart guide on Feast's website (http://docs.feast.dev/quickstart), which +uses this repo. A quick view of what's in this repository's `feature_repo/` directory: + +* `data/` contains raw demo parquet data +* `example_repo.py` contains demo feature definitions +* `feature_store.yaml` contains a demo setup configuring where data sources are +* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features. + +You can run the overall workflow with `python test_workflow.py`. + +## To move from this into a more production ready workflow: +1. First: you should start with a different Feast template, which delegates to a more scalable offline store. + - For example, running `feast init -t gcp` + or `feast init -t aws` or `feast init -t snowflake`. + - You can see your options if you run `feast init --help`. +2. `feature_store.yaml` points to a local file as a registry. You'll want to setup a remote file (e.g. in S3/GCS) or a +SQL registry. See [registry docs](https://docs.feast.dev/getting-started/concepts/registry) for more details. +3. This example uses a file [offline store](https://docs.feast.dev/getting-started/architecture-and-components/offline-store) + to generate training data. It does not scale. We recommend instead using a data warehouse such as BigQuery, + Snowflake, Redshift. There is experimental support for Spark as well. +4. 
Setup CI/CD + dev vs staging vs prod environments to automatically update the registry as you change Feast feature definitions. See [docs](https://docs.feast.dev/how-to-guides/running-feast-in-production#1.-automatically-deploying-changes-to-your-feature-definitions). +5. (optional) Regularly scheduled materialization to power low latency feature retrieval (e.g. via Airflow). See [Batch data ingestion](https://docs.feast.dev/getting-started/concepts/data-ingestion#batch-data-ingestion) +for more details. +6. (optional) Deploy feature server instances with `feast serve` to expose endpoints to retrieve online features. + - See [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) for details. + - Use cases can also directly call the Feast client to fetch features as per [Feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) \ No newline at end of file diff --git a/sdk/python/feast/templates/local/bootstrap.py b/sdk/python/feast/templates/local/bootstrap.py index 4013ca5a8d..125eb7c2e7 100644 --- a/sdk/python/feast/templates/local/bootstrap.py +++ b/sdk/python/feast/templates/local/bootstrap.py @@ -1,3 +1,6 @@ +from feast.file_utils import replace_str_in_file + + def bootstrap(): # Bootstrap() will automatically be called from the init_repo() during `feast init` @@ -6,7 +9,7 @@ def bootstrap(): from feast.driver_test_data import create_driver_hourly_stats_df - repo_path = pathlib.Path(__file__).parent.absolute() + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" data_path = repo_path / "data" data_path.mkdir(exist_ok=True) @@ -19,17 +22,9 @@ def bootstrap(): driver_stats_path = data_path / "driver_stats.parquet" driver_df.to_parquet(path=str(driver_stats_path), allow_truncated_timestamps=True) - example_py_file = repo_path / "example.py" + example_py_file = repo_path / "example_repo.py" replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) -def 
replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with open(file_path, "wt") as f: - f.write(contents) - - if __name__ == "__main__": bootstrap() diff --git a/sdk/python/feast/templates/local/example.py b/sdk/python/feast/templates/local/example.py deleted file mode 100644 index 4fd30ba3a1..0000000000 --- a/sdk/python/feast/templates/local/example.py +++ /dev/null @@ -1,41 +0,0 @@ -# This is an example feature definition file - -from datetime import timedelta - -from feast import Entity, FeatureService, FeatureView, Field, FileSource -from feast.types import Float32, Int64 - -# Read data from parquet files. Parquet is convenient for local development mode. For -# production, you can use your favorite DWH, such as BigQuery. See Feast documentation -# for more info. -driver_hourly_stats = FileSource( - name="driver_hourly_stats_source", - path="%PARQUET_PATH%", - timestamp_field="event_timestamp", - created_timestamp_column="created", -) - -# Define an entity for the driver. You can think of entity as a primary key used to -# fetch features. -driver = Entity(name="driver", join_keys=["driver_id"]) - -# Our parquet files contain sample data that includes a driver_id column, timestamps and -# three feature column. Here we define a Feature View that will allow us to serve this -# data to our model online. 
-driver_hourly_stats_view = FeatureView( - name="driver_hourly_stats", - entities=[driver], - ttl=timedelta(days=1), - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - online=True, - source=driver_hourly_stats, - tags={}, -) - -driver_stats_fs = FeatureService( - name="driver_activity", features=[driver_hourly_stats_view] -) diff --git a/sdk/python/feast/templates/local/feature_repo/__init__.py b/sdk/python/feast/templates/local/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/local/feature_repo/example_repo.py b/sdk/python/feast/templates/local/feature_repo/example_repo.py new file mode 100644 index 0000000000..b3c7115482 --- /dev/null +++ b/sdk/python/feast/templates/local/feature_repo/example_repo.py @@ -0,0 +1,100 @@ +# This is an example feature definition file + +from datetime import timedelta + +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + FileSource, + PushSource, + RequestSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Read data from parquet files. Parquet is convenient for local development mode. For +# production, you can use your favorite DWH, such as BigQuery. See Feast documentation +# for more info. +driver_stats_source = FileSource( + name="driver_hourly_stats_source", + path="%PARQUET_PATH%", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Our parquet files contain sample data that includes a driver_id column, timestamps and +# three feature column. Here we define a Feature View that will allow us to serve this +# data to our model online. 
+driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + online=True, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/local/feature_repo/feature_store.yaml b/sdk/python/feast/templates/local/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..3e6a360316 --- /dev/null +++ b/sdk/python/feast/templates/local/feature_repo/feature_store.yaml @@ -0,0 +1,9 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +registry: data/registry.db +# The provider primarily specifies default offline / online stores & storing the registry in a given cloud +provider: local +online_store: + type: sqlite + path: data/online_store.db +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/local/feature_repo/test_workflow.py b/sdk/python/feast/templates/local/feature_repo/test_workflow.py new file mode 
100644 index 0000000000..76b8d7836c --- /dev/null +++ b/sdk/python/feast/templates/local/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # 
entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/local/feature_store.yaml b/sdk/python/feast/templates/local/feature_store.yaml deleted file mode 100644 index fddde04f90..0000000000 --- 
a/sdk/python/feast/templates/local/feature_store.yaml +++ /dev/null @@ -1,6 +0,0 @@ -project: my_project -registry: data/registry.db -provider: local -online_store: - path: data/online_store.db -entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/minimal/__init__.py b/sdk/python/feast/templates/minimal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/minimal/feature_repo/__init__.py b/sdk/python/feast/templates/minimal/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/minimal/feature_store.yaml b/sdk/python/feast/templates/minimal/feature_repo/feature_store.yaml similarity index 100% rename from sdk/python/feast/templates/minimal/feature_store.yaml rename to sdk/python/feast/templates/minimal/feature_repo/feature_store.yaml diff --git a/sdk/python/feast/templates/postgres/bootstrap.py b/sdk/python/feast/templates/postgres/bootstrap.py index 078d7cdc68..9f6e8a988d 100644 --- a/sdk/python/feast/templates/postgres/bootstrap.py +++ b/sdk/python/feast/templates/postgres/bootstrap.py @@ -1,6 +1,7 @@ import click import psycopg2 +from feast.file_utils import replace_str_in_file from feast.infra.utils.postgres.connection_utils import df_to_postgres_table from feast.infra.utils.postgres.postgres_config import PostgreSQLConfig @@ -13,7 +14,7 @@ def bootstrap(): from feast.driver_test_data import create_driver_hourly_stats_df - repo_path = pathlib.Path(__file__).parent.absolute() + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" config_file = repo_path / "feature_store.yaml" end_date = datetime.now().replace(microsecond=0, second=0, minute=0) @@ -66,13 +67,5 @@ def bootstrap(): replace_str_in_file(config_file, "DB_PASSWORD", postgres_password) -def replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with 
open(file_path, "wt") as f: - f.write(contents) - - if __name__ == "__main__": bootstrap() diff --git a/sdk/python/feast/templates/postgres/driver_repo.py b/sdk/python/feast/templates/postgres/driver_repo.py deleted file mode 100644 index 61e32eb58e..0000000000 --- a/sdk/python/feast/templates/postgres/driver_repo.py +++ /dev/null @@ -1,32 +0,0 @@ -from datetime import timedelta - -from feast import Entity, FeatureView, Field -from feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source import ( - PostgreSQLSource, -) -from feast.types import Float32, Int64 - -driver = Entity( - name="driver_id", - join_keys=["driver_id"], -) - - -driver_stats_source = PostgreSQLSource( - name="feast_driver_hourly_stats", - query="SELECT * FROM feast_driver_hourly_stats", - timestamp_field="event_timestamp", - created_timestamp_column="created", -) - -driver_stats_fv = FeatureView( - name="driver_hourly_stats", - entities=[driver], - ttl=timedelta(weeks=52), - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], - source=driver_stats_source, -) diff --git a/sdk/python/feast/templates/postgres/feature_repo/__init__.py b/sdk/python/feast/templates/postgres/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/postgres/feature_repo/example_repo.py b/sdk/python/feast/templates/postgres/feature_repo/example_repo.py new file mode 100644 index 0000000000..a7ba9d7eac --- /dev/null +++ b/sdk/python/feast/templates/postgres/feature_repo/example_repo.py @@ -0,0 +1,92 @@ +# This is an example feature definition file + +from datetime import timedelta + +import pandas as pd + +from feast import Entity, FeatureService, FeatureView, Field, PushSource, RequestSource +from feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source import ( + PostgreSQLSource, +) +from feast.on_demand_feature_view import 
on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +driver_stats_source = PostgreSQLSource( + name="driver_hourly_stats_source", + query="SELECT * FROM feast_driver_hourly_stats", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Our parquet files contain sample data that includes a driver_id column, timestamps and +# three feature column. Here we define a Feature View that will allow us to serve this +# data to our model online. +driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + online=True, + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/postgres/feature_store.yaml b/sdk/python/feast/templates/postgres/feature_repo/feature_store.yaml similarity index 100% rename from sdk/python/feast/templates/postgres/feature_store.yaml rename to sdk/python/feast/templates/postgres/feature_repo/feature_store.yaml diff --git a/sdk/python/feast/templates/postgres/feature_repo/test_workflow.py b/sdk/python/feast/templates/postgres/feature_repo/test_workflow.py new file mode 100644 index 0000000000..ca5c1ccf42 --- /dev/null +++ b/sdk/python/feast/templates/postgres/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + 
print("\n--- Run feast apply to setup feature store on Postgres ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. 
Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/postgres/test.py b/sdk/python/feast/templates/postgres/test.py deleted file mode 100644 index d547bc8c64..0000000000 --- a/sdk/python/feast/templates/postgres/test.py +++ /dev/null @@ -1,64 +0,0 @@ -from datetime import datetime, timedelta - -import pandas as pd -from driver_repo import driver, driver_stats_fv - -from feast import FeatureStore - - -def main(): - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 1000) - - # Load the 
feature store from the current path - fs = FeatureStore(repo_path=".") - - print("Deploying feature store to Postgres...") - fs.apply([driver, driver_stats_fv]) - - # Select features - features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] - - # Create an entity dataframe. This is the dataframe that will be enriched with historical features - entity_df = pd.DataFrame( - { - "event_timestamp": [ - pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") - for dt in pd.date_range( - start=datetime.now() - timedelta(days=3), - end=datetime.now(), - periods=3, - ) - ], - "driver_id": [1001, 1002, 1003], - } - ) - - print("Retrieving training data...") - - training_df = fs.get_historical_features( - features=features, entity_df=entity_df - ).to_df() - - print() - print(training_df) - - print() - print("Loading features into the online store...") - fs.materialize_incremental(end_date=datetime.now()) - - print() - print("Retrieving online features...") - - # Retrieve features from the online store - online_features = fs.get_online_features( - features=features, - entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], - ).to_dict() - - print() - print(pd.DataFrame.from_dict(online_features)) - - -if __name__ == "__main__": - main() diff --git a/sdk/python/feast/templates/snowflake/README.md b/sdk/python/feast/templates/snowflake/README.md new file mode 100644 index 0000000000..0c950de435 --- /dev/null +++ b/sdk/python/feast/templates/snowflake/README.md @@ -0,0 +1,19 @@ +# Feast Quickstart +A quick view of what's in this repository: + +* `data/` contains raw demo parquet data +* `driver_repo.py` contains demo feature definitions +* `feature_store.yaml` contains a demo setup configuring where data sources are +* `test_workflow.py` showcases how to run all key Feast commands, including defining, retrieving, and pushing features. + +You can run the overall workflow with `python test_workflow.py`. 
+ +## To move from this into a more production ready workflow: +1. `feature_store.yaml` points to a local file as a registry. You'll want to setup a remote file (e.g. in S3/GCS) or a + SQL registry. See [registry docs](https://docs.feast.dev/getting-started/concepts/registry) for more details. +2. Setup CI/CD + dev vs staging vs prod environments to automatically update the registry as you change Feast feature definitions. See [docs](https://docs.feast.dev/how-to-guides/running-feast-in-production#1.-automatically-deploying-changes-to-your-feature-definitions). +3. (optional) Regularly scheduled materialization to power low latency feature retrieval (e.g. via Airflow). See [Batch data ingestion](https://docs.feast.dev/getting-started/concepts/data-ingestion#batch-data-ingestion) + for more details. +4. (optional) Deploy feature server instances with `feast serve` to expose endpoints to retrieve online features. + - See [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) for details. 
+ - Use cases can also directly call the Feast client to fetch features as per [Feature retrieval](https://docs.feast.dev/getting-started/concepts/feature-retrieval) diff --git a/sdk/python/feast/templates/snowflake/bootstrap.py b/sdk/python/feast/templates/snowflake/bootstrap.py index 1663a1fb8b..01f4045fe7 100644 --- a/sdk/python/feast/templates/snowflake/bootstrap.py +++ b/sdk/python/feast/templates/snowflake/bootstrap.py @@ -1,7 +1,11 @@ import click import snowflake.connector -from feast.infra.utils.snowflake_utils import write_pandas +from feast.file_utils import replace_str_in_file +from feast.infra.utils.snowflake.snowflake_utils import ( + execute_snowflake_statement, + write_pandas, +) def bootstrap(): @@ -13,8 +17,8 @@ def bootstrap(): from feast.driver_test_data import create_driver_hourly_stats_df repo_path = pathlib.Path(__file__).parent.absolute() - project_name = str(repo_path)[str(repo_path).rfind("/") + 1 :] + repo_path = repo_path / "feature_repo" end_date = datetime.now().replace(microsecond=0, second=0, minute=0) start_date = end_date - timedelta(days=15) @@ -37,7 +41,7 @@ def bootstrap(): snowflake_database = click.prompt("Snowflake Database Name (Case Sensitive):") config_file = repo_path / "feature_store.yaml" - for i in range(2): + for i in range(3): replace_str_in_file( config_file, "SNOWFLAKE_DEPLOYMENT_URL", snowflake_deployment_url ) @@ -52,7 +56,7 @@ def bootstrap(): default=True, ): - conn = snowflake.connector.connect( + snowflake_conn = snowflake.connector.connect( account=snowflake_deployment_url, user=snowflake_user, password=snowflake_password, @@ -61,27 +65,27 @@ def bootstrap(): application="feast", ) - cur = conn.cursor() - cur.execute(f'CREATE DATABASE IF NOT EXISTS "{snowflake_database}"') - cur.execute(f'USE DATABASE "{snowflake_database}"') - cur.execute('CREATE SCHEMA IF NOT EXISTS "PUBLIC"') - cur.execute('USE SCHEMA "PUBLIC"') - cur.execute(f'DROP TABLE IF EXISTS "{project_name}_feast_driver_hourly_stats"') - 
write_pandas( - conn, - driver_df, - f"{project_name}_feast_driver_hourly_stats", - auto_create_table=True, - ) - conn.close() - - -def replace_str_in_file(file_path, match_str, sub_str): - with open(file_path, "r") as f: - contents = f.read() - contents = contents.replace(match_str, sub_str) - with open(file_path, "wt") as f: - f.write(contents) + with snowflake_conn as conn: + execute_snowflake_statement( + conn, f'CREATE DATABASE IF NOT EXISTS "{snowflake_database}"' + ) + execute_snowflake_statement(conn, f'USE DATABASE "{snowflake_database}"') + execute_snowflake_statement(conn, 'CREATE SCHEMA IF NOT EXISTS "PUBLIC"') + execute_snowflake_statement(conn, 'USE SCHEMA "PUBLIC"') + execute_snowflake_statement( + conn, f'DROP TABLE IF EXISTS "{project_name}_feast_driver_hourly_stats"' + ) + execute_snowflake_statement( + conn, + f'ALTER WAREHOUSE IF EXISTS "{snowflake_warehouse}" RESUME IF SUSPENDED', + ) + + write_pandas( + conn, + driver_df, + f"{project_name}_feast_driver_hourly_stats", + auto_create_table=True, + ) if __name__ == "__main__": diff --git a/sdk/python/feast/templates/snowflake/driver_repo.py b/sdk/python/feast/templates/snowflake/driver_repo.py deleted file mode 100644 index 54f6b67126..0000000000 --- a/sdk/python/feast/templates/snowflake/driver_repo.py +++ /dev/null @@ -1,58 +0,0 @@ -from datetime import timedelta - -import yaml - -from feast import Entity, FeatureService, FeatureView, SnowflakeSource - -# Define an entity for the driver. Entities can be thought of as primary keys used to -# retrieve features. Entities are also used to join multiple tables/views during the -# construction of feature vectors -driver = Entity( - # Name of the entity. Must be unique within a project - name="driver", - # The join keys of an entity describe the storage level field/column on which - # features can be looked up. 
The join keys are also used to join feature - # tables/views when building feature vectors - join_keys=["driver_id"], -) - -# Indicates a data source from which feature values can be retrieved. Sources are queried when building training -# datasets or materializing features into an online store. -project_name = yaml.safe_load(open("feature_store.yaml"))["project"] - -driver_stats_source = SnowflakeSource( - # The Snowflake table where features can be found - database=yaml.safe_load(open("feature_store.yaml"))["offline_store"]["database"], - table=f"{project_name}_feast_driver_hourly_stats", - # The event timestamp is used for point-in-time joins and for ensuring only - # features within the TTL are returned - timestamp_field="event_timestamp", - # The (optional) created timestamp is used to ensure there are no duplicate - # feature rows in the offline store or when building training datasets - created_timestamp_column="created", -) - -# Feature views are a grouping based on how features are stored in either the -# online or offline store. -driver_stats_fv = FeatureView( - # The unique name of this feature view. Two feature views in a single - # project cannot have the same name - name="driver_hourly_stats", - # The list of entities specifies the keys required for joining or looking - # up features from this feature view. The reference provided in this field - # correspond to the name of a defined entity (or entities) - entities=[driver], - # The timedelta is the maximum age that each feature value may have - # relative to its lookup time. For historical features (used in training), - # TTL is relative to each timestamp provided in the entity dataframe. - # TTL also allows for eviction of keys from online stores and limits the - # amount of historical scanning required for historical feature values - # during retrieval - ttl=timedelta(weeks=52), - # Batch sources are used to find feature values. 
In the case of this feature - # view we will query a source table on Redshift for driver statistics - # features - batch_source=driver_stats_source, -) - -driver_stats_fs = FeatureService(name="driver_activity", features=[driver_stats_fv]) diff --git a/sdk/python/feast/templates/snowflake/feature_repo/driver_repo.py b/sdk/python/feast/templates/snowflake/feature_repo/driver_repo.py new file mode 100644 index 0000000000..4befa693f9 --- /dev/null +++ b/sdk/python/feast/templates/snowflake/feature_repo/driver_repo.py @@ -0,0 +1,112 @@ +from datetime import timedelta + +import pandas as pd +import yaml + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + PushSource, + RequestSource, + SnowflakeSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Float32, Float64, Int64 + +# Define an entity for the driver. You can think of an entity as a primary key used to +# fetch features. +driver = Entity(name="driver", join_keys=["driver_id"]) + +# Defines a data source from which feature values can be retrieved. Sources are queried when building training +# datasets or materializing features into an online store. +project_name = yaml.safe_load(open("feature_store.yaml"))["project"] + +driver_stats_source = SnowflakeSource( + # The Snowflake table where features can be found + database=yaml.safe_load(open("feature_store.yaml"))["offline_store"]["database"], + table=f"{project_name}_feast_driver_hourly_stats", + # The event timestamp is used for point-in-time joins and for ensuring only + # features within the TTL are returned + timestamp_field="event_timestamp", + # The (optional) created timestamp is used to ensure there are no duplicate + # feature rows in the offline store or when building training datasets + created_timestamp_column="created", +) + +# Feature views are a grouping based on how features are stored in either the +# online or offline store. 
+driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + # The list of entities specifies the keys required for joining or looking + # up features from this feature view. The reference provided in this field + # correspond to the name of a defined entity (or entities) + entities=[driver], + # The timedelta is the maximum age that each feature value may have + # relative to its lookup time. For historical features (used in training), + # TTL is relative to each timestamp provided in the entity dataframe. + # TTL also allows for eviction of keys from online stores and limits the + # amount of historical scanning required for historical feature values + # during retrieval + ttl=timedelta(weeks=52 * 10), # Set to be very long for example purposes only + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + source=driver_stats_source, + # Tags are user defined key/value pairs that are attached to each + # feature view + tags={"team": "driver_performance"}, +) + +# Defines a way to push data (to be available offline, online or both) into Feast. +driver_stats_push_source = PushSource( + name="driver_stats_push_source", + batch_source=driver_stats_source, +) + +# Define a request data source which encodes features / information only +# available at request time (e.g. 
part of the user initiated HTTP request) +input_request = RequestSource( + name="vals_to_add", + schema=[ + Field(name="val_to_add", dtype=Int64), + Field(name="val_to_add_2", dtype=Int64), + ], +) + + +# Define an on demand feature view which can generate new features based on +# existing feature views and RequestSource features +@on_demand_feature_view( + sources=[driver_stats_fv, input_request], + schema=[ + Field(name="conv_rate_plus_val1", dtype=Float64), + Field(name="conv_rate_plus_val2", dtype=Float64), + ], +) +def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"] + df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"] + return df + + +# This groups features into a model version +driver_activity_v1 = FeatureService( + name="driver_activity_v1", + features=[ + driver_stats_fv[["conv_rate"]], # Sub-selects a feature from a feature view + transformed_conv_rate, # Selects all features from the feature view + ], +) +driver_activity_v2 = FeatureService( + name="driver_activity_v2", features=[driver_stats_fv, transformed_conv_rate] +) diff --git a/sdk/python/feast/templates/snowflake/feature_store.yaml b/sdk/python/feast/templates/snowflake/feature_repo/feature_store.yaml similarity index 70% rename from sdk/python/feast/templates/snowflake/feature_store.yaml rename to sdk/python/feast/templates/snowflake/feature_repo/feature_store.yaml index 39f266f89f..104e6394c6 100644 --- a/sdk/python/feast/templates/snowflake/feature_store.yaml +++ b/sdk/python/feast/templates/snowflake/feature_repo/feature_store.yaml @@ -9,7 +9,14 @@ offline_store: role: SNOWFLAKE_ROLE warehouse: SNOWFLAKE_WAREHOUSE database: SNOWFLAKE_DATABASE -entity_key_serialization_version: 2 +batch_engine: + type: snowflake.engine + account: SNOWFLAKE_DEPLOYMENT_URL + user: SNOWFLAKE_USER + password: SNOWFLAKE_PASSWORD + role: SNOWFLAKE_ROLE + warehouse: SNOWFLAKE_WAREHOUSE + 
database: SNOWFLAKE_DATABASE online_store: type: snowflake.online account: SNOWFLAKE_DEPLOYMENT_URL @@ -18,3 +25,4 @@ online_store: role: SNOWFLAKE_ROLE warehouse: SNOWFLAKE_WAREHOUSE database: SNOWFLAKE_DATABASE +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/snowflake/test.py b/sdk/python/feast/templates/snowflake/test.py deleted file mode 100644 index 3c33f6aefd..0000000000 --- a/sdk/python/feast/templates/snowflake/test.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import datetime, timedelta - -import pandas as pd -from driver_repo import driver, driver_stats_fv - -from feast import FeatureStore - - -def main(): - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 1000) - - # Load the feature store from the current path - fs = FeatureStore(repo_path=".") - - # Deploy the feature store to Snowflake - print("Deploying feature store to Snowflake...") - fs.apply([driver, driver_stats_fv]) - - # Select features - features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] - - # Create an entity dataframe. 
This is the dataframe that will be enriched with historical features - entity_df = pd.DataFrame( - { - "event_timestamp": [ - pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") - for dt in pd.date_range( - start=datetime.now() - timedelta(days=3), - end=datetime.now(), - periods=3, - ) - ], - "driver_id": [1001, 1002, 1003], - } - ) - - print("Retrieving training data...") - - # Retrieve historical features by joining the entity dataframe to the Snowflake table source - training_df = fs.get_historical_features( - features=features, entity_df=entity_df - ).to_df() - - print() - print(training_df) - - print() - print("Loading features into the online store...") - fs.materialize_incremental(end_date=datetime.now()) - - print() - print("Retrieving online features...") - - # Retrieve features from the online store - online_features = fs.get_online_features( - features=features, - entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], - ).to_dict() - - print() - print(pd.DataFrame.from_dict(online_features)) - - -if __name__ == "__main__": - main() diff --git a/sdk/python/feast/templates/snowflake/test_workflow.py b/sdk/python/feast/templates/snowflake/test_workflow.py new file mode 100644 index 0000000000..6f5e33622a --- /dev/null +++ b/sdk/python/feast/templates/snowflake/test_workflow.py @@ -0,0 +1,126 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path="./feature_repo") + print("\n--- Run feast apply to setup feature store on Snowflake ---") + command = "cd feature_repo; feast apply" + subprocess.run(command, shell=True) + + print("\n--- Historical features for training ---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online 
store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + command = "cd feature_repo; feast teardown" + subprocess.run(command, shell=True) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. 
Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/spark/bootstrap.py b/sdk/python/feast/templates/spark/bootstrap.py index b57387d3d7..fc0be4ea0a 100644 --- a/sdk/python/feast/templates/spark/bootstrap.py +++ b/sdk/python/feast/templates/spark/bootstrap.py @@ -8,7 +8,7 @@ def bootstrap(): create_driver_hourly_stats_df, ) - repo_path = pathlib.Path(__file__).parent.absolute() + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" data_path = repo_path / "data" data_path.mkdir(exist_ok=True) 
diff --git a/sdk/python/feast/templates/spark/feature_repo/__init__.py b/sdk/python/feast/templates/spark/feature_repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/feature_repo/example_repo.py similarity index 100% rename from sdk/python/feast/templates/spark/example.py rename to sdk/python/feast/templates/spark/feature_repo/example_repo.py diff --git a/sdk/python/feast/templates/spark/feature_repo/feature_store.yaml b/sdk/python/feast/templates/spark/feature_repo/feature_store.yaml new file mode 100644 index 0000000000..f72c7c65f4 --- /dev/null +++ b/sdk/python/feast/templates/spark/feature_repo/feature_store.yaml @@ -0,0 +1,17 @@ +project: my_project +# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry) +# On GCP/AWS, minimally you should create a GCS/S3 bucket for a remote file registry +registry: data/registry.db +provider: local +offline_store: + type: spark + spark_conf: + spark.master: "local[*]" + spark.ui.enabled: "false" + spark.eventLog.enabled: "false" + spark.sql.catalogImplementation: "hive" + spark.sql.parser.quotedRegexColumnNames: "true" + spark.sql.session.timeZone: "UTC" +online_store: + path: data/online_store.db +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/spark/feature_repo/test_workflow.py b/sdk/python/feast/templates/spark/feature_repo/test_workflow.py new file mode 100644 index 0000000000..08d493fc54 --- /dev/null +++ b/sdk/python/feast/templates/spark/feature_repo/test_workflow.py @@ -0,0 +1,124 @@ +import subprocess +from datetime import datetime + +import pandas as pd + +from feast import FeatureStore +from feast.data_source import PushMode + + +def run_demo(): + store = FeatureStore(repo_path=".") + print("\n--- Run feast apply to setup feature store on Snowflake ---") + subprocess.run(["feast", "apply"]) + + print("\n--- Historical features for training 
---") + fetch_historical_features_entity_df(store, for_batch_scoring=False) + + print("\n--- Historical features for batch scoring ---") + fetch_historical_features_entity_df(store, for_batch_scoring=True) + + print("\n--- Load features into online store ---") + store.materialize_incremental(end_date=datetime.now()) + + print("\n--- Online features ---") + fetch_online_features(store, use_feature_service=False) + + print("\n--- Online features retrieved (instead) through a feature service---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Simulate a stream event ingestion of the hourly stats df ---") + event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } + ) + print(event_df) + store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE) + + print("\n--- Online features again with updated values from a stream push---") + fetch_online_features(store, use_feature_service=True) + + print("\n--- Run feast teardown ---") + subprocess.run(["feast", "teardown"]) + + +def fetch_historical_features_entity_df(store: FeatureStore, for_batch_scoring: bool): + # Note: see https://docs.feast.dev/getting-started/concepts/feature-retrieval for more details on how to retrieve + # for all entities in the offline store instead + entity_df = pd.DataFrame.from_dict( + { + # entity's join key -> entity values + "driver_id": [1001, 1002, 1003], + # "event_timestamp" (reserved key) -> timestamps + "event_timestamp": [ + datetime(2021, 4, 12, 10, 59, 42), + datetime(2021, 4, 12, 8, 12, 10), + datetime(2021, 4, 12, 16, 40, 26), + ], + # (optional) label name -> label values. 
Feast does not process these + "label_driver_reported_satisfaction": [1, 5, 3], + # values we're using for an on-demand transformation + "val_to_add": [1, 2, 3], + "val_to_add_2": [10, 20, 30], + } + ) + # For batch scoring, we want the latest timestamps + if for_batch_scoring: + entity_df["event_timestamp"] = pd.to_datetime("now", utc=True) + + training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ], + ).to_df() + print(training_df.head()) + + +def fetch_online_features(store, use_feature_service: bool): + entity_rows = [ + # {join_key: entity_value} + { + "driver_id": 1001, + "val_to_add": 1000, + "val_to_add_2": 2000, + }, + { + "driver_id": 1002, + "val_to_add": 1001, + "val_to_add_2": 2002, + }, + ] + if use_feature_service: + features_to_fetch = store.get_feature_service("driver_activity_v1") + else: + features_to_fetch = [ + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips", + "transformed_conv_rate:conv_rate_plus_val1", + "transformed_conv_rate:conv_rate_plus_val2", + ] + returned_features = store.get_online_features( + features=features_to_fetch, + entity_rows=entity_rows, + ).to_dict() + for key, value in sorted(returned_features.items()): + print(key, " : ", value) + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index ed4b7cba59..2cb1c4fefb 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -15,6 +15,7 @@ from collections import defaultdict from datetime import datetime, timezone from typing import ( + TYPE_CHECKING, Any, Dict, Iterator, @@ -31,7 +32,6 @@ import numpy as np import pandas as pd -import pyarrow from google.protobuf.timestamp_pb2 import Timestamp from feast.protos.feast.types.Value_pb2 import 
( @@ -46,6 +46,9 @@ from feast.protos.feast.types.Value_pb2 import Value as ProtoValue from feast.value_type import ListType, ValueType +if TYPE_CHECKING: + import pyarrow + # null timestamps get converted to -9223372036854775808 NULL_TIMESTAMP_INT_VALUE = np.datetime64("NaT").astype(int) @@ -228,6 +231,30 @@ def python_values_to_feast_value_type( return inferred_dtype +def _convert_value_type_str_to_value_type(type_str: str) -> ValueType: + type_map = { + "UNKNOWN": ValueType.UNKNOWN, + "BYTES": ValueType.BYTES, + "STRING": ValueType.STRING, + "INT32": ValueType.INT32, + "INT64": ValueType.INT64, + "DOUBLE": ValueType.DOUBLE, + "FLOAT": ValueType.FLOAT, + "BOOL": ValueType.BOOL, + "NULL": ValueType.NULL, + "UNIX_TIMESTAMP": ValueType.UNIX_TIMESTAMP, + "BYTES_LIST": ValueType.BYTES_LIST, + "STRING_LIST": ValueType.STRING_LIST, + "INT32_LIST ": ValueType.INT32_LIST, + "INT64_LIST": ValueType.INT64_LIST, + "DOUBLE_LIST": ValueType.DOUBLE_LIST, + "FLOAT_LIST": ValueType.FLOAT_LIST, + "BOOL_LIST": ValueType.BOOL_LIST, + "UNIX_TIMESTAMP_LIST": ValueType.UNIX_TIMESTAMP_LIST, + } + return type_map[type_str] + + def _type_err(item, dtype): raise TypeError(f'Value "{item}" is of type {type(item)} not of type {dtype}') @@ -505,6 +532,73 @@ def bq_to_feast_value_type(bq_type_as_str: str) -> ValueType: return value_type +def mssql_to_feast_value_type(mssql_type_as_str: str) -> ValueType: + type_map = { + "bigint": ValueType.FLOAT, + "binary": ValueType.BYTES, + "bit": ValueType.BOOL, + "char": ValueType.STRING, + "date": ValueType.UNIX_TIMESTAMP, + "datetime": ValueType.UNIX_TIMESTAMP, + "float": ValueType.FLOAT, + "nchar": ValueType.STRING, + "nvarchar": ValueType.STRING, + "nvarchar(max)": ValueType.STRING, + "real": ValueType.FLOAT, + "smallint": ValueType.INT32, + "tinyint": ValueType.INT32, + "varbinary": ValueType.BYTES, + "varchar": ValueType.STRING, + "None": ValueType.NULL, + # skip date, geometry, hllsketch, time, timetz + } + if mssql_type_as_str.lower() not in 
type_map: + raise ValueError(f"Mssql type not supported by feast {mssql_type_as_str}") + return type_map[mssql_type_as_str.lower()] + + +def pa_to_mssql_type(pa_type: "pyarrow.DataType") -> str: + # PyArrow types: https://arrow.apache.org/docs/python/api/datatypes.html + # MS Sql types: https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver16 + pa_type_as_str = str(pa_type).lower() + if pa_type_as_str.startswith("timestamp"): + if "tz=" in pa_type_as_str: + return "datetime2" + else: + return "datetime" + + if pa_type_as_str.startswith("date"): + return "date" + + if pa_type_as_str.startswith("decimal"): + return pa_type_as_str + + # We have to take into account how arrow types map to parquet types as well. + # For example, null type maps to int32 in parquet, so we have to use int4 in Redshift. + # Other mappings have also been adjusted accordingly. + type_map = { + "null": "None", + "bool": "bit", + "int8": "tinyint", + "int16": "smallint", + "int32": "int", + "int64": "bigint", + "uint8": "tinyint", + "uint16": "smallint", + "uint32": "int", + "uint64": "bigint", + "float": "float", + "double": "real", + "binary": "binary", + "string": "varchar", + } + + if pa_type_as_str.lower() not in type_map: + raise ValueError(f"MS SQL Server type not supported by feast {pa_type_as_str}") + + return type_map[pa_type_as_str] + + def redshift_to_feast_value_type(redshift_type_as_str: str) -> ValueType: # Type names from https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html type_map = { @@ -525,30 +619,37 @@ def redshift_to_feast_value_type(redshift_type_as_str: str) -> ValueType: return type_map[redshift_type_as_str.lower()] -def snowflake_python_type_to_feast_value_type( - snowflake_python_type_as_str: str, -) -> ValueType: - +def snowflake_type_to_feast_value_type(snowflake_type: str) -> ValueType: type_map = { - "str": ValueType.STRING, - "float64": ValueType.DOUBLE, - "int64": ValueType.INT64, - "uint64": 
ValueType.INT64, - "int32": ValueType.INT32, - "uint32": ValueType.INT32, - "int16": ValueType.INT32, - "uint16": ValueType.INT32, - "uint8": ValueType.INT32, - "int8": ValueType.INT32, - "datetime64[ns]": ValueType.UNIX_TIMESTAMP, - "object": ValueType.STRING, - "bool": ValueType.BOOL, + "BINARY": ValueType.BYTES, + "VARCHAR": ValueType.STRING, + "NUMBER32": ValueType.INT32, + "NUMBER64": ValueType.INT64, + "DOUBLE": ValueType.DOUBLE, + "BOOLEAN": ValueType.BOOL, + "TIMESTAMP": ValueType.UNIX_TIMESTAMP, + "TIMESTAMP_TZ": ValueType.UNIX_TIMESTAMP, + "TIMESTAMP_LTZ": ValueType.UNIX_TIMESTAMP, + "TIMESTAMP_NTZ": ValueType.UNIX_TIMESTAMP, } - - return type_map[snowflake_python_type_as_str.lower()] + return type_map[snowflake_type] + + +def _convert_value_name_to_snowflake_udf(value_name: str, project_name: str) -> str: + name_map = { + "BYTES": f"feast_{project_name}_snowflake_binary_to_bytes_proto", + "STRING": f"feast_{project_name}_snowflake_varchar_to_string_proto", + "INT32": f"feast_{project_name}_snowflake_number_to_int32_proto", + "INT64": f"feast_{project_name}_snowflake_number_to_int64_proto", + "DOUBLE": f"feast_{project_name}_snowflake_float_to_double_proto", + "FLOAT": f"feast_{project_name}_snowflake_float_to_double_proto", + "BOOL": f"feast_{project_name}_snowflake_boolean_to_bool_proto", + "UNIX_TIMESTAMP": f"feast_{project_name}_snowflake_timestamp_to_unix_timestamp_proto", + } + return name_map[value_name].upper() -def pa_to_redshift_value_type(pa_type: pyarrow.DataType) -> str: +def pa_to_redshift_value_type(pa_type: "pyarrow.DataType") -> str: # PyArrow types: https://arrow.apache.org/docs/python/api/datatypes.html # Redshift type: https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html pa_type_as_str = str(pa_type).lower() @@ -728,7 +829,9 @@ def pg_type_to_feast_value_type(type_str: str) -> ValueType: return value -def feast_value_type_to_pa(feast_type: ValueType) -> pyarrow.DataType: +def feast_value_type_to_pa(feast_type: 
ValueType) -> "pyarrow.DataType": + import pyarrow + type_map = { ValueType.INT32: pyarrow.int32(), ValueType.INT64: pyarrow.int64(), @@ -791,3 +894,60 @@ def pg_type_code_to_arrow(code: int) -> str: return feast_value_type_to_pa( pg_type_to_feast_value_type(pg_type_code_to_pg_type(code)) ) + + +def athena_to_feast_value_type(athena_type_as_str: str) -> ValueType: + # Type names from https://docs.aws.amazon.com/athena/latest/ug/data-types.html + type_map = { + "null": ValueType.UNKNOWN, + "boolean": ValueType.BOOL, + "tinyint": ValueType.INT32, + "smallint": ValueType.INT32, + "int": ValueType.INT32, + "bigint": ValueType.INT64, + "double": ValueType.DOUBLE, + "float": ValueType.FLOAT, + "binary": ValueType.BYTES, + "char": ValueType.STRING, + "varchar": ValueType.STRING, + "string": ValueType.STRING, + "timestamp": ValueType.UNIX_TIMESTAMP, + # skip date,decimal,array,map,struct + } + return type_map[athena_type_as_str.lower()] + + +def pa_to_athena_value_type(pa_type: "pyarrow.DataType") -> str: + # PyArrow types: https://arrow.apache.org/docs/python/api/datatypes.html + # Type names from https://docs.aws.amazon.com/athena/latest/ug/data-types.html + pa_type_as_str = str(pa_type).lower() + if pa_type_as_str.startswith("timestamp"): + return "timestamp" + + if pa_type_as_str.startswith("date"): + return "date" + + if pa_type_as_str.startswith("python_values_to_proto_values"): + return pa_type_as_str + + # We have to take into account how arrow types map to parquet types as well. + # For example, null type maps to int32 in parquet, so we have to use int4 in Redshift. + # Other mappings have also been adjusted accordingly. 
+ type_map = { + "null": "null", + "bool": "boolean", + "int8": "tinyint", + "int16": "smallint", + "int32": "int", + "int64": "bigint", + "uint8": "tinyint", + "uint16": "tinyint", + "uint32": "tinyint", + "uint64": "tinyint", + "float": "float", + "double": "double", + "binary": "binary", + "string": "string", + } + + return type_map[pa_type_as_str] diff --git a/sdk/python/feast/ui/package.json b/sdk/python/feast/ui/package.json index 883c19660b..358aa2cdd2 100644 --- a/sdk/python/feast/ui/package.json +++ b/sdk/python/feast/ui/package.json @@ -6,7 +6,7 @@ "@elastic/datemath": "^5.0.3", "@elastic/eui": "^57.0.0", "@emotion/react": "^11.9.0", - "@feast-dev/feast-ui": "^0.20.5", + "@feast-dev/feast-ui": "latest", "@testing-library/jest-dom": "^5.16.4", "@testing-library/react": "^13.2.0", "@testing-library/user-event": "^13.5.0", diff --git a/sdk/python/feast/ui/yarn.lock b/sdk/python/feast/ui/yarn.lock index b44fc5f51a..df2bfe45ff 100644 --- a/sdk/python/feast/ui/yarn.lock +++ b/sdk/python/feast/ui/yarn.lock @@ -1345,10 +1345,10 @@ minimatch "^3.1.2" strip-json-comments "^3.1.1" -"@feast-dev/feast-ui@^0.20.5": - version "0.20.5" - resolved "https://registry.yarnpkg.com/@feast-dev/feast-ui/-/feast-ui-0.20.5.tgz#bb0d6fc81cbd92ca69b779982ab151a8d9cabaee" - integrity sha512-BwMPJSv1MkylHxPnU/2fZX77AC/G4H2DIf+HAj80ZklwB0zbmeZzhXFrVh4xSheevGZFh0L839JeL14WfXPZsA== +"@feast-dev/feast-ui@latest": + version "0.24.0" + resolved "https://registry.yarnpkg.com/@feast-dev/feast-ui/-/feast-ui-0.24.0.tgz#a52037247563290f92d0d993fcaf0d88e9741f36" + integrity sha512-Te27bSVFp7gCE7+p9bbCkCEQ7+nsRCzBtwWivNPBFRn8HC2ewBzmRzzasXlCHok1cXHDbh7Xj7y+2Hshp91LTg== dependencies: "@elastic/datemath" "^5.0.3" "@elastic/eui" "^55.0.1" diff --git a/sdk/python/feast/usage.py b/sdk/python/feast/usage.py index 5e78aa52d2..0965e70999 100644 --- a/sdk/python/feast/usage.py +++ b/sdk/python/feast/usage.py @@ -29,6 +29,7 @@ import requests +from feast import flags_helper from feast.constants import 
DEFAULT_FEAST_USAGE_VALUE, FEAST_USAGE from feast.version import get_version @@ -53,6 +54,13 @@ ).hexdigest(), } +APPLICATION_NAME = "feast-dev/feast" +USER_AGENT = "{}/{}".format(APPLICATION_NAME, get_version()) + + +def get_user_agent(): + return USER_AGENT + def set_current_project_uuid(project_uuid: str): _constant_attributes["project_id"] = project_uuid @@ -172,7 +180,8 @@ def _export(event: typing.Dict[str, typing.Any]): def _produce_event(ctx: UsageContext): - is_test = bool({"pytest", "unittest"} & sys.modules.keys()) + # Cannot check for unittest because typeguard pulls in unittest + is_test = flags_helper.is_test() or bool({"pytest"} & sys.modules.keys()) event = { "timestamp": datetime.utcnow().isoformat(), "is_test": is_test, diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index 4ff99c247f..9d10b2c313 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -42,7 +42,7 @@ asn1crypto==1.5.1 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) -asttokens==2.0.5 +asttokens==2.0.8 # via stack-data async-timeout==4.0.2 # via @@ -56,7 +56,7 @@ attrs==22.1.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.2 +azure-core==1.25.0 # via # adlfs # azure-identity @@ -65,9 +65,13 @@ azure-core==1.24.2 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs -azure-storage-blob==12.13.0 - # via adlfs + # via + # adlfs + # feast (setup.py) +azure-storage-blob==12.13.1 + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -90,12 +94,17 @@ build==0.8.0 # via # feast (setup.py) # pip-tools +bytewax==0.10.0 + # via feast (setup.py) cachecontrol==0.12.11 # via firebase-admin cachetools==5.2.0 # via google-auth +cassandra-driver==3.25.0 + # via feast (setup.py) certifi==2022.6.15 # via + # kubernetes # minio # msrest # requests @@ -117,6 +126,7 @@ click==8.1.3 # 
black # bowler # feast (setup.py) + # geomet # great-expectations # moreorless # pip-tools @@ -127,7 +137,7 @@ colorama==0.4.5 # via # feast (setup.py) # great-expectations -coverage[toml]==6.4.2 +coverage[toml]==6.4.4 # via pytest-cov cryptography==35.0.0 # via @@ -138,13 +148,14 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations -db-dtypes==1.0.2 +db-dtypes==1.0.3 # via google-cloud-bigquery decorator==5.1.1 # via @@ -155,10 +166,12 @@ deprecated==1.2.13 deprecation==2.1.0 # via testcontainers dill==0.3.5.1 - # via feast (setup.py) + # via + # feast (setup.py) + # multiprocess distlib==0.3.5 # via virtualenv -docker==5.0.3 +docker==6.0.0 # via # feast (setup.py) # testcontainers @@ -170,25 +183,25 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.9.1 +executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro fastjsonschema==2.16.1 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv firebase-admin==5.2.0 # via feast (setup.py) fissix==21.11.13 # via bowler -flake8==5.0.2 +flake8==5.0.4 # via feast (setup.py) -frozenlist==1.3.0 +frozenlist==1.3.1 # via # aiohttp # aiosignal @@ -200,6 +213,8 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) +geomet==0.2.1.post1 + # via cassandra-driver google-api-core[grpc]==2.8.2 # via # feast (setup.py) @@ -211,9 +226,9 @@ google-api-core[grpc]==2.8.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-api-python-client==2.55.0 +google-api-python-client==2.57.0 # via firebase-admin -google-auth==2.9.1 +google-auth==2.10.0 # via # gcsfs # google-api-core @@ -222,13 +237,14 @@ google-auth==2.9.1 # google-auth-oauthlib # google-cloud-core # google-cloud-storage + # kubernetes google-auth-httplib2==0.1.0 # 
via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery[pandas]==3.3.0 +google-cloud-bigquery[pandas]==3.3.2 # via feast (setup.py) -google-cloud-bigquery-storage==2.14.1 +google-cloud-bigquery-storage==2.14.2 # via # feast (setup.py) # google-cloud-bigquery @@ -238,11 +254,11 @@ google-cloud-core==2.3.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.8.0 +google-cloud-datastore==2.8.1 # via feast (setup.py) -google-cloud-firestore==2.6.0 +google-cloud-firestore==2.6.1 # via firebase-admin -google-cloud-storage==2.4.0 +google-cloud-storage==2.5.0 # via # feast (setup.py) # firebase-admin @@ -261,6 +277,8 @@ googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -290,7 +308,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.2 +identify==2.5.3 # via pre-commit idna==3.3 # via @@ -327,7 +345,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.9.0 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ -335,14 +353,15 @@ jsonschema==4.9.0 # nbformat jupyter-core==4.11.1 # via nbformat +kubernetes==20.13.0 + # via feast (setup.py) locket==1.0.0 # via partd markupsafe==2.1.1 # via # jinja2 # moto - # werkzeug -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via ipython mccabe==0.7.0 # via flake8 @@ -356,7 +375,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.16 +moto==3.1.18 # via feast (setup.py) msal==1.18.0 # via @@ -376,6 +395,8 @@ multidict==6.0.2 # via # aiohttp # yarl +multiprocess==0.70.13 + # via bytewax mypy==0.971 # via # feast (setup.py) @@ -392,7 +413,7 @@ nbformat==5.4.0 # via great-expectations nodeenv==1.7.0 # via pre-commit -numpy==1.23.1 +numpy==1.23.2 # via # altair # db-dtypes @@ -412,6 +433,7 @@ packaging==21.3 # dask 
# db-dtypes # deprecation + # docker # google-cloud-bigquery # great-expectations # pytest @@ -430,11 +452,11 @@ pandavro==1.5.2 # via feast (setup.py) parso==0.8.3 # via jedi -partd==1.2.0 +partd==1.3.0 # via dask pathspec==0.9.0 # via black -pbr==5.9.0 +pbr==5.10.0 # via mock pep517==0.13.0 # via build @@ -458,7 +480,7 @@ pre-commit==2.20.0 # via feast (setup.py) prompt-toolkit==3.0.30 # via ipython -proto-plus==1.20.6 +proto-plus==1.22.0 # via # feast (setup.py) # google-cloud-bigquery @@ -511,19 +533,19 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.9.0 +pycodestyle==2.9.1 # via flake8 pycparser==2.21 # via cffi pycryptodomex==3.15.0 # via snowflake-connector-python -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) pyflakes==2.5.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # feast (setup.py) # ipython @@ -533,6 +555,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -577,11 +603,12 @@ python-dateutil==2.8.2 # botocore # google-cloud-bigquery # great-expectations + # kubernetes # moto # pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via # babel # great-expectations @@ -595,6 +622,7 @@ pyyaml==6.0 # via # dask # feast (setup.py) + # kubernetes # pre-commit # uvicorn redis==4.2.2 @@ -612,6 +640,7 @@ requests==2.28.1 # google-cloud-bigquery # google-cloud-storage # great-expectations + # kubernetes # moto # msal # msrest @@ -623,6 +652,7 @@ requests==2.28.1 requests-oauthlib==1.3.1 # via # google-auth-oauthlib + # kubernetes # msrest responses==0.21.0 # via moto @@ -640,10 +670,14 @@ six==1.16.0 # via # azure-core # azure-identity + # cassandra-driver + # geomet # google-auth # google-auth-httplib2 # grpcio # happybase + # isodate + # kubernetes # mock # msrestazure # pandavro @@ 
-672,11 +706,11 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) -sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy -stack-data==0.3.0 +stack-data==0.4.0 # via ipython starlette==0.19.1 # via fastapi @@ -688,7 +722,7 @@ tensorflow-metadata==1.9.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -testcontainers==3.6.0 +testcontainers==3.6.1 # via feast (setup.py) thriftpy2==0.4.14 # via happybase @@ -729,19 +763,19 @@ types-protobuf==3.19.22 # mypy-protobuf types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.13 +types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.6 +types-requests==2.28.9 # via feast (setup.py) -types-setuptools==63.2.2 +types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.20 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via @@ -750,7 +784,7 @@ typing-extensions==4.3.0 # mypy # pydantic # sqlalchemy2-stubs -tzdata==2022.1 +tzdata==2022.2 # via pytz-deprecation-shim tzlocal==4.2 # via great-expectations @@ -759,8 +793,10 @@ uritemplate==4.1.1 urllib3==1.26.11 # via # botocore + # docker # feast (setup.py) # great-expectations + # kubernetes # minio # requests # responses @@ -768,7 +804,7 @@ uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.16.2 +virtualenv==20.16.3 # via pre-commit volatile==2.1.0 # via bowler @@ -777,10 +813,12 @@ watchfiles==0.16.1 wcwidth==0.2.5 # via prompt-toolkit websocket-client==1.3.3 - # via docker + # via + # docker + # kubernetes websockets==10.3 # via uvicorn -werkzeug==2.2.1 +werkzeug==2.1.2 # via moto wheel==0.37.1 # via pip-tools @@ -791,7 +829,7 @@ wrapt==1.14.1 # testcontainers 
xmltodict==0.13.0 # via moto -yarl==1.8.0 +yarl==1.8.1 # via aiohttp zipp==3.8.1 # via importlib-metadata diff --git a/sdk/python/requirements/py3.10-requirements.txt b/sdk/python/requirements/py3.10-requirements.txt index 8ae219f1fe..ac12befb87 100644 --- a/sdk/python/requirements/py3.10-requirements.txt +++ b/sdk/python/requirements/py3.10-requirements.txt @@ -38,9 +38,9 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro @@ -50,13 +50,15 @@ fsspec==2022.7.1 # via dask google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.9.1 +google-auth==2.10.0 # via google-api-core googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -73,7 +75,7 @@ idna==3.3 # requests jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.9.0 +jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -87,7 +89,7 @@ mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.23.1 +numpy==1.23.2 # via # feast (setup.py) # pandas @@ -101,9 +103,9 @@ pandas==1.4.3 # pandavro pandavro==1.5.2 # via feast (setup.py) -partd==1.2.0 +partd==1.3.0 # via dask -proto-plus==1.20.6 +proto-plus==1.22.0 # via feast (setup.py) protobuf==3.20.1 # via @@ -121,11 +123,11 @@ pyasn1==0.4.8 # rsa pyasn1-modules==0.2.8 # via google-auth -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) -pygments==2.12.0 +pygments==2.13.0 # via feast (setup.py) pyparsing==3.0.9 # via packaging @@ -135,7 +137,7 @@ python-dateutil==2.8.2 # via pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via pandas pyyaml==6.0 # via @@ -154,9 +156,9 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) 
-sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy starlette==0.19.1 # via fastapi diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index 931a7d1e24..93011cfdcf 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -42,7 +42,7 @@ asn1crypto==1.5.1 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) -asttokens==2.0.5 +asttokens==2.0.8 # via stack-data async-timeout==4.0.2 # via @@ -56,7 +56,7 @@ attrs==22.1.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.2 +azure-core==1.25.0 # via # adlfs # azure-identity @@ -65,9 +65,13 @@ azure-core==1.24.2 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs -azure-storage-blob==12.13.0 - # via adlfs + # via + # adlfs + # feast (setup.py) +azure-storage-blob==12.13.1 + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -94,12 +98,17 @@ build==0.8.0 # via # feast (setup.py) # pip-tools +bytewax==0.10.0 + # via feast (setup.py) cachecontrol==0.12.11 # via firebase-admin cachetools==5.2.0 # via google-auth +cassandra-driver==3.25.0 + # via feast (setup.py) certifi==2022.6.15 # via + # kubernetes # minio # msrest # requests @@ -121,6 +130,7 @@ click==8.1.3 # black # bowler # feast (setup.py) + # geomet # great-expectations # moreorless # pip-tools @@ -131,7 +141,7 @@ colorama==0.4.5 # via # feast (setup.py) # great-expectations -coverage[toml]==6.4.2 +coverage[toml]==6.4.4 # via pytest-cov cryptography==35.0.0 # via @@ -142,13 +152,14 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations -db-dtypes==1.0.2 +db-dtypes==1.0.3 # via google-cloud-bigquery decorator==5.1.1 # via @@ -159,10 +170,12 @@ deprecated==1.2.13 deprecation==2.1.0 # via testcontainers 
dill==0.3.5.1 - # via feast (setup.py) + # via + # feast (setup.py) + # multiprocess distlib==0.3.5 # via virtualenv -docker==5.0.3 +docker==6.0.0 # via # feast (setup.py) # testcontainers @@ -174,25 +187,25 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.9.1 +executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro fastjsonschema==2.16.1 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv firebase-admin==5.2.0 # via feast (setup.py) fissix==21.11.13 # via bowler -flake8==5.0.2 +flake8==5.0.4 # via feast (setup.py) -frozenlist==1.3.0 +frozenlist==1.3.1 # via # aiohttp # aiosignal @@ -204,6 +217,8 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) +geomet==0.2.1.post1 + # via cassandra-driver google-api-core[grpc]==2.8.2 # via # feast (setup.py) @@ -215,9 +230,9 @@ google-api-core[grpc]==2.8.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-api-python-client==2.55.0 +google-api-python-client==2.57.0 # via firebase-admin -google-auth==2.9.1 +google-auth==2.10.0 # via # gcsfs # google-api-core @@ -226,13 +241,14 @@ google-auth==2.9.1 # google-auth-oauthlib # google-cloud-core # google-cloud-storage + # kubernetes google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery[pandas]==3.3.0 +google-cloud-bigquery[pandas]==3.3.2 # via feast (setup.py) -google-cloud-bigquery-storage==2.14.1 +google-cloud-bigquery-storage==2.14.2 # via # feast (setup.py) # google-cloud-bigquery @@ -242,11 +258,11 @@ google-cloud-core==2.3.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.8.0 +google-cloud-datastore==2.8.1 # via feast (setup.py) -google-cloud-firestore==2.6.0 +google-cloud-firestore==2.6.1 # via firebase-admin -google-cloud-storage==2.4.0 +google-cloud-storage==2.5.0 
# via # feast (setup.py) # firebase-admin @@ -265,6 +281,8 @@ googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -294,7 +312,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.2 +identify==2.5.3 # via pre-commit idna==3.3 # via @@ -333,7 +351,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.9.0 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ -341,14 +359,15 @@ jsonschema==4.9.0 # nbformat jupyter-core==4.11.1 # via nbformat +kubernetes==20.13.0 + # via feast (setup.py) locket==1.0.0 # via partd markupsafe==2.1.1 # via # jinja2 # moto - # werkzeug -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via ipython mccabe==0.7.0 # via flake8 @@ -362,7 +381,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.16 +moto==3.1.18 # via feast (setup.py) msal==1.18.0 # via @@ -382,6 +401,8 @@ multidict==6.0.2 # via # aiohttp # yarl +multiprocess==0.70.13 + # via bytewax mypy==0.971 # via # feast (setup.py) @@ -398,7 +419,7 @@ nbformat==5.4.0 # via great-expectations nodeenv==1.7.0 # via pre-commit -numpy==1.23.1 +numpy==1.23.2 # via # altair # db-dtypes @@ -418,6 +439,7 @@ packaging==21.3 # dask # db-dtypes # deprecation + # docker # google-cloud-bigquery # great-expectations # pytest @@ -436,11 +458,11 @@ pandavro==1.5.2 # via feast (setup.py) parso==0.8.3 # via jedi -partd==1.2.0 +partd==1.3.0 # via dask pathspec==0.9.0 # via black -pbr==5.9.0 +pbr==5.10.0 # via mock pep517==0.13.0 # via build @@ -466,7 +488,7 @@ pre-commit==2.20.0 # via feast (setup.py) prompt-toolkit==3.0.30 # via ipython -proto-plus==1.20.6 +proto-plus==1.22.0 # via # feast (setup.py) # google-cloud-bigquery @@ -519,19 +541,19 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.9.0 +pycodestyle==2.9.1 # 
via flake8 pycparser==2.21 # via cffi pycryptodomex==3.15.0 # via snowflake-connector-python -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) pyflakes==2.5.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # feast (setup.py) # ipython @@ -541,6 +563,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -585,11 +611,12 @@ python-dateutil==2.8.2 # botocore # google-cloud-bigquery # great-expectations + # kubernetes # moto # pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via # babel # great-expectations @@ -603,6 +630,7 @@ pyyaml==6.0 # via # dask # feast (setup.py) + # kubernetes # pre-commit # uvicorn redis==4.2.2 @@ -620,6 +648,7 @@ requests==2.28.1 # google-cloud-bigquery # google-cloud-storage # great-expectations + # kubernetes # moto # msal # msrest @@ -631,6 +660,7 @@ requests==2.28.1 requests-oauthlib==1.3.1 # via # google-auth-oauthlib + # kubernetes # msrest responses==0.21.0 # via moto @@ -650,10 +680,14 @@ six==1.16.0 # via # azure-core # azure-identity + # cassandra-driver + # geomet # google-auth # google-auth-httplib2 # grpcio # happybase + # isodate + # kubernetes # mock # msrestazure # pandavro @@ -682,11 +716,11 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) -sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy -stack-data==0.3.0 +stack-data==0.4.0 # via ipython starlette==0.19.1 # via fastapi @@ -698,7 +732,7 @@ tensorflow-metadata==1.9.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -testcontainers==3.6.0 +testcontainers==3.6.1 # via feast (setup.py) thriftpy2==0.4.14 # via happybase @@ -739,19 +773,19 @@ types-protobuf==3.19.22 # mypy-protobuf 
types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.13 +types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.6 +types-requests==2.28.9 # via feast (setup.py) -types-setuptools==63.2.2 +types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.20 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via @@ -763,7 +797,7 @@ typing-extensions==4.3.0 # pydantic # sqlalchemy2-stubs # starlette -tzdata==2022.1 +tzdata==2022.2 # via pytz-deprecation-shim tzlocal==4.2 # via great-expectations @@ -772,8 +806,10 @@ uritemplate==4.1.1 urllib3==1.26.11 # via # botocore + # docker # feast (setup.py) # great-expectations + # kubernetes # minio # requests # responses @@ -781,7 +817,7 @@ uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.16.2 +virtualenv==20.16.3 # via pre-commit volatile==2.1.0 # via bowler @@ -790,10 +826,12 @@ watchfiles==0.16.1 wcwidth==0.2.5 # via prompt-toolkit websocket-client==1.3.3 - # via docker + # via + # docker + # kubernetes websockets==10.3 # via uvicorn -werkzeug==2.2.1 +werkzeug==2.1.2 # via moto wheel==0.37.1 # via pip-tools @@ -804,7 +842,7 @@ wrapt==1.14.1 # testcontainers xmltodict==0.13.0 # via moto -yarl==1.8.0 +yarl==1.8.1 # via aiohttp zipp==3.8.1 # via diff --git a/sdk/python/requirements/py3.8-requirements.txt b/sdk/python/requirements/py3.8-requirements.txt index 362780d69e..c2aef63673 100644 --- a/sdk/python/requirements/py3.8-requirements.txt +++ b/sdk/python/requirements/py3.8-requirements.txt @@ -38,9 +38,9 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro @@ -50,13 +50,15 @@ fsspec==2022.7.1 # via dask 
google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.9.1 +google-auth==2.10.0 # via google-api-core googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -75,7 +77,7 @@ importlib-resources==5.9.0 # via jsonschema jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.9.0 +jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -89,7 +91,7 @@ mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.23.1 +numpy==1.23.2 # via # feast (setup.py) # pandas @@ -103,11 +105,11 @@ pandas==1.4.3 # pandavro pandavro==1.5.2 # via feast (setup.py) -partd==1.2.0 +partd==1.3.0 # via dask pkgutil-resolve-name==1.3.10 # via jsonschema -proto-plus==1.20.6 +proto-plus==1.22.0 # via feast (setup.py) protobuf==3.20.1 # via @@ -125,11 +127,11 @@ pyasn1==0.4.8 # rsa pyasn1-modules==0.2.8 # via google-auth -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) -pygments==2.12.0 +pygments==2.13.0 # via feast (setup.py) pyparsing==3.0.9 # via packaging @@ -139,7 +141,7 @@ python-dateutil==2.8.2 # via pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via pandas pyyaml==6.0 # via @@ -158,9 +160,9 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) -sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy starlette==0.19.1 # via fastapi diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index 5d118a3ae2..e13eee056b 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -42,7 +42,7 @@ asn1crypto==1.5.1 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) -asttokens==2.0.5 +asttokens==2.0.8 # via stack-data async-timeout==4.0.2 # via @@ -56,7 +56,7 @@ 
attrs==22.1.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.2 +azure-core==1.25.0 # via # adlfs # azure-identity @@ -65,9 +65,13 @@ azure-core==1.24.2 azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 - # via adlfs -azure-storage-blob==12.13.0 - # via adlfs + # via + # adlfs + # feast (setup.py) +azure-storage-blob==12.13.1 + # via + # adlfs + # feast (setup.py) babel==2.10.3 # via sphinx backcall==0.2.0 @@ -90,12 +94,17 @@ build==0.8.0 # via # feast (setup.py) # pip-tools +bytewax==0.10.0 + # via feast (setup.py) cachecontrol==0.12.11 # via firebase-admin cachetools==5.2.0 # via google-auth +cassandra-driver==3.25.0 + # via feast (setup.py) certifi==2022.6.15 # via + # kubernetes # minio # msrest # requests @@ -117,6 +126,7 @@ click==8.1.3 # black # bowler # feast (setup.py) + # geomet # great-expectations # moreorless # pip-tools @@ -127,7 +137,7 @@ colorama==0.4.5 # via # feast (setup.py) # great-expectations -coverage[toml]==6.4.2 +coverage[toml]==6.4.4 # via pytest-cov cryptography==35.0.0 # via @@ -138,13 +148,14 @@ cryptography==35.0.0 # great-expectations # moto # msal + # pyjwt # pyopenssl # snowflake-connector-python dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations -db-dtypes==1.0.2 +db-dtypes==1.0.3 # via google-cloud-bigquery decorator==5.1.1 # via @@ -155,10 +166,12 @@ deprecated==1.2.13 deprecation==2.1.0 # via testcontainers dill==0.3.5.1 - # via feast (setup.py) + # via + # feast (setup.py) + # multiprocess distlib==0.3.5 # via virtualenv -docker==5.0.3 +docker==6.0.0 # via # feast (setup.py) # testcontainers @@ -170,25 +183,25 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.9.1 +executing==0.10.0 # via stack-data -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro fastjsonschema==2.16.1 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv firebase-admin==5.2.0 # via 
feast (setup.py) fissix==21.11.13 # via bowler -flake8==5.0.2 +flake8==5.0.4 # via feast (setup.py) -frozenlist==1.3.0 +frozenlist==1.3.1 # via # aiohttp # aiosignal @@ -200,6 +213,8 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) +geomet==0.2.1.post1 + # via cassandra-driver google-api-core[grpc]==2.8.2 # via # feast (setup.py) @@ -211,9 +226,9 @@ google-api-core[grpc]==2.8.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-api-python-client==2.55.0 +google-api-python-client==2.57.0 # via firebase-admin -google-auth==2.9.1 +google-auth==2.10.0 # via # gcsfs # google-api-core @@ -222,13 +237,14 @@ google-auth==2.9.1 # google-auth-oauthlib # google-cloud-core # google-cloud-storage + # kubernetes google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery[pandas]==3.3.0 +google-cloud-bigquery[pandas]==3.3.2 # via feast (setup.py) -google-cloud-bigquery-storage==2.14.1 +google-cloud-bigquery-storage==2.14.2 # via # feast (setup.py) # google-cloud-bigquery @@ -238,11 +254,11 @@ google-cloud-core==2.3.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.8.0 +google-cloud-datastore==2.8.1 # via feast (setup.py) -google-cloud-firestore==2.6.0 +google-cloud-firestore==2.6.1 # via firebase-admin -google-cloud-storage==2.4.0 +google-cloud-storage==2.5.0 # via # feast (setup.py) # firebase-admin @@ -261,6 +277,8 @@ googleapis-common-protos==1.56.4 # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -290,7 +308,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.2 +identify==2.5.3 # via pre-commit idna==3.3 # via @@ -327,7 +345,7 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.9.0 +jsonschema==4.13.0 # via # altair # feast (setup.py) @@ 
-335,14 +353,15 @@ jsonschema==4.9.0 # nbformat jupyter-core==4.11.1 # via nbformat +kubernetes==20.13.0 + # via feast (setup.py) locket==1.0.0 # via partd markupsafe==2.1.1 # via # jinja2 # moto - # werkzeug -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via ipython mccabe==0.7.0 # via flake8 @@ -356,7 +375,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.16 +moto==3.1.18 # via feast (setup.py) msal==1.18.0 # via @@ -376,6 +395,8 @@ multidict==6.0.2 # via # aiohttp # yarl +multiprocess==0.70.13 + # via bytewax mypy==0.971 # via # feast (setup.py) @@ -392,7 +413,7 @@ nbformat==5.4.0 # via great-expectations nodeenv==1.7.0 # via pre-commit -numpy==1.23.1 +numpy==1.23.2 # via # altair # db-dtypes @@ -412,6 +433,7 @@ packaging==21.3 # dask # db-dtypes # deprecation + # docker # google-cloud-bigquery # great-expectations # pytest @@ -430,11 +452,11 @@ pandavro==1.5.2 # via feast (setup.py) parso==0.8.3 # via jedi -partd==1.2.0 +partd==1.3.0 # via dask pathspec==0.9.0 # via black -pbr==5.9.0 +pbr==5.10.0 # via mock pep517==0.13.0 # via build @@ -458,7 +480,7 @@ pre-commit==2.20.0 # via feast (setup.py) prompt-toolkit==3.0.30 # via ipython -proto-plus==1.20.6 +proto-plus==1.22.0 # via # feast (setup.py) # google-cloud-bigquery @@ -511,19 +533,19 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.9.0 +pycodestyle==2.9.1 # via flake8 pycparser==2.21 # via cffi pycryptodomex==3.15.0 # via snowflake-connector-python -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) pyflakes==2.5.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # feast (setup.py) # ipython @@ -533,6 +555,10 @@ pyjwt[crypto]==2.4.0 # adal # msal # snowflake-connector-python +pymssql==2.2.5 + # via feast (setup.py) +pyodbc==4.0.34 + # via feast (setup.py) pyopenssl==22.0.0 # via snowflake-connector-python pyparsing==2.4.7 @@ -577,11 +603,12 @@ python-dateutil==2.8.2 # botocore # 
google-cloud-bigquery # great-expectations + # kubernetes # moto # pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via # babel # great-expectations @@ -595,6 +622,7 @@ pyyaml==6.0 # via # dask # feast (setup.py) + # kubernetes # pre-commit # uvicorn redis==4.2.2 @@ -612,6 +640,7 @@ requests==2.28.1 # google-cloud-bigquery # google-cloud-storage # great-expectations + # kubernetes # moto # msal # msrest @@ -623,15 +652,16 @@ requests==2.28.1 requests-oauthlib==1.3.1 # via # google-auth-oauthlib + # kubernetes # msrest responses==0.21.0 # via moto rsa==4.9 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml s3fs==2022.1.0 # via feast (setup.py) s3transfer==0.5.2 @@ -642,10 +672,14 @@ six==1.16.0 # via # azure-core # azure-identity + # cassandra-driver + # geomet # google-auth # google-auth-httplib2 # grpcio # happybase + # isodate + # kubernetes # mock # msrestazure # pandavro @@ -674,11 +708,11 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) -sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy -stack-data==0.3.0 +stack-data==0.4.0 # via ipython starlette==0.19.1 # via fastapi @@ -690,7 +724,7 @@ tensorflow-metadata==1.9.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -testcontainers==3.6.0 +testcontainers==3.6.1 # via feast (setup.py) thriftpy2==0.4.14 # via happybase @@ -731,19 +765,19 @@ types-protobuf==3.19.22 # mypy-protobuf types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.2 +types-pytz==2022.2.1.0 # via feast (setup.py) types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.13 +types-redis==4.3.14 # via feast (setup.py) -types-requests==2.28.6 +types-requests==2.28.9 # via feast (setup.py) -types-setuptools==63.2.2 
+types-setuptools==65.1.0 # via feast (setup.py) types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.20 +types-urllib3==1.26.23 # via types-requests typing-extensions==4.3.0 # via @@ -755,7 +789,7 @@ typing-extensions==4.3.0 # pydantic # sqlalchemy2-stubs # starlette -tzdata==2022.1 +tzdata==2022.2 # via pytz-deprecation-shim tzlocal==4.2 # via great-expectations @@ -764,8 +798,10 @@ uritemplate==4.1.1 urllib3==1.26.11 # via # botocore + # docker # feast (setup.py) # great-expectations + # kubernetes # minio # requests # responses @@ -773,7 +809,7 @@ uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.16.2 +virtualenv==20.16.3 # via pre-commit volatile==2.1.0 # via bowler @@ -782,10 +818,12 @@ watchfiles==0.16.1 wcwidth==0.2.5 # via prompt-toolkit websocket-client==1.3.3 - # via docker + # via + # docker + # kubernetes websockets==10.3 # via uvicorn -werkzeug==2.2.1 +werkzeug==2.1.2 # via moto wheel==0.37.1 # via pip-tools @@ -796,7 +834,7 @@ wrapt==1.14.1 # testcontainers xmltodict==0.13.0 # via moto -yarl==1.8.0 +yarl==1.8.1 # via aiohttp zipp==3.8.1 # via importlib-metadata diff --git a/sdk/python/requirements/py3.9-requirements.txt b/sdk/python/requirements/py3.9-requirements.txt index 1ef60c531a..0d3cb22bbc 100644 --- a/sdk/python/requirements/py3.9-requirements.txt +++ b/sdk/python/requirements/py3.9-requirements.txt @@ -38,9 +38,9 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.79.0 +fastapi==0.79.1 # via feast (setup.py) -fastavro==1.5.4 +fastavro==1.6.0 # via # feast (setup.py) # pandavro @@ -50,13 +50,15 @@ fsspec==2022.7.1 # via dask google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.9.1 +google-auth==2.10.0 # via google-api-core googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata +greenlet==1.1.2 + # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -73,7 +75,7 @@ idna==3.3 # requests 
jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.9.0 +jsonschema==4.13.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -87,7 +89,7 @@ mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.23.1 +numpy==1.23.2 # via # feast (setup.py) # pandas @@ -101,9 +103,9 @@ pandas==1.4.3 # pandavro pandavro==1.5.2 # via feast (setup.py) -partd==1.2.0 +partd==1.3.0 # via dask -proto-plus==1.20.6 +proto-plus==1.22.0 # via feast (setup.py) protobuf==3.20.1 # via @@ -121,11 +123,11 @@ pyasn1==0.4.8 # rsa pyasn1-modules==0.2.8 # via google-auth -pydantic==1.9.1 +pydantic==1.9.2 # via # fastapi # feast (setup.py) -pygments==2.12.0 +pygments==2.13.0 # via feast (setup.py) pyparsing==3.0.9 # via packaging @@ -135,7 +137,7 @@ python-dateutil==2.8.2 # via pandas python-dotenv==0.20.0 # via uvicorn -pytz==2022.1 +pytz==2022.2.1 # via pandas pyyaml==6.0 # via @@ -154,9 +156,9 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.39 +sqlalchemy[mypy]==1.4.40 # via feast (setup.py) -sqlalchemy2-stubs==0.0.2a24 +sqlalchemy2-stubs==0.0.2a25 # via sqlalchemy starlette==0.19.1 # via fastapi diff --git a/sdk/python/setup.cfg b/sdk/python/setup.cfg index ebb933f69d..d934249d69 100644 --- a/sdk/python/setup.cfg +++ b/sdk/python/setup.cfg @@ -14,7 +14,7 @@ ignore = E203, E266, E501, W503, C901 max-line-length = 88 max-complexity = 20 select = B,C,E,F,W,T4 -exclude = .git,__pycache__,docs/conf.py,dist,feast/protos,feast/embedded_go/lib +exclude = .git,__pycache__,docs/conf.py,dist,feast/protos,feast/embedded_go/lib,feast/infra/utils/snowflake/snowpark/snowflake_udfs.py [mypy] files=feast,tests diff --git a/sdk/python/tests/README.md b/sdk/python/tests/README.md new file mode 100644 index 0000000000..3212f02482 --- /dev/null +++ b/sdk/python/tests/README.md @@ -0,0 +1,343 @@ +# Testing Suite + +## Overview + +This guide will go over: + +1. how Feast tests are setup +2. how to extend the test suite to test new functionality +3. 
how to use the existing test suite to test a new custom offline / online store. + +## Test suite overview + +Let's inspect the test setup in `sdk/python/tests/integration`: + +```bash +$ tree +. +├── e2e +│ ├── test_go_feature_server.py +│ ├── test_python_feature_server.py +│ ├── test_universal_e2e.py +│ ├── test_usage_e2e.py +│ └── test_validation.py +├── feature_repos +│ ├── integration_test_repo_config.py +│ ├── repo_configuration.py +│ └── universal +│ ├── catalog +│ ├── data_source_creator.py +│ ├── data_sources +│ │ ├── __init__.py +│ │ ├── bigquery.py +│ │ ├── file.py +│ │ ├── redshift.py +│ │ └── snowflake.py +│ ├── entities.py +│ ├── feature_views.py +│ ├── online_store +│ │ ├── __init__.py +│ │ ├── datastore.py +│ │ ├── dynamodb.py +│ │ ├── hbase.py +│ │ └── redis.py +│ └── online_store_creator.py +├── materialization +│ └── test_lambda.py +├── offline_store +│ ├── test_feature_logging.py +│ ├── test_offline_write.py +│ ├── test_push_features_to_offline_store.py +│ ├── test_s3_custom_endpoint.py +│ └── test_universal_historical_retrieval.py +├── online_store +│ ├── test_online_retrieval.py +│ ├── test_push_features_to_online_store.py +│ └── test_universal_online.py +└── registration + ├── test_feature_store.py + ├── test_inference.py + ├── test_registry.py + ├── test_sql_registry.py + ├── test_universal_cli.py + ├── test_universal_odfv_feature_inference.py + └── test_universal_types.py + +``` + +* `feature_repos` has setup files for most tests in the test suite. +* `conftest.py` and some of the individual test files contain fixtures which can be used to on different offline stores, online stores, etc. and thus abstract away store specific implementations so we don't need to rewrite the same test implementation for different stores. + +## Structure of the test suite + +### What is the universal test suite? + +The universal test suite verifies that crucial Feast functions (e.g `get_historical_features`, `get_online_features` etc.) 
have the correct behavior for each of the different environments that Feast could be used in. These environments are combinations of an offline store, online store, and provider and the universal test suite serves to run basic functional verification against all of these different permutations. + +We use pytest [fixtures](https://docs.pytest.org/en/6.2.x/fixture.html) to accomplish this without writing excess code. + +Tests in Feast are split into integration and unit tests. + +### Is it an integration or unit test? + +* Integration tests test non local Feast behavior. Integration tests mainly involve testing of Feast components that connect to services outside of Feast(e.g connecting to gcp or aws clients). + * Generally if the test requires the initialization of a feature store in an external environment in order to test (i.e using our universal test fixtures), it is probably an integration test. +* Unit tests, on the other hand, unit tests primarily test local and class level behavior that does not require spinning up an external service. If your test can be run locally without using any other services besides pytest, it is a unit test. + +### Main types of tests + +#### Integration tests + +1. E2E tests + * E2E tests test end-to-end functionality of Feast over the various codepaths (initialize a feature store, apply, and materialize). + * The main codepaths include: + * basic e2e tests for offline stores + * `test_universal_e2e.py` + * go feature server + * `test_go_feature_server.py` + * python http server + * `test_python_feature_server.py` + * usage tracking + * `test_usage_e2e.py` + * data quality monitoring feature validation + * `test_validation.py` +2. Offline and Online Store Tests + * Offline and online store tests mainly test for the offline and online retrieval functionality. 
+ * The various specific functionalities that are tested include: + * push API tests + * `test_push_features_to_offline_store.py` + * `test_push_features_to_online_store.py` + * `test_offline_write.py` + * historical retrieval tests + * `test_universal_historical_retrieval.py` + * online retrieval tests + * `test_universal_online.py` + * data quality monitoring feature logging tests + * `test_feature_logging.py` + * online store tests + * `test_universal_online.py` +3. Registration Tests + * The registration folder contains all of the registry tests and some universal cli tests. This includes: + * CLI Apply and Materialize tests tested against on the universal test suite + * Data type inference tests + * Registry tests +4. Miscellaneous Tests + * AWS Lambda Materialization Tests (Currently do not work) + * `test_lambda.py` + +#### Unit tests + +1. Registry Diff Tests + * These are tests for the infrastructure and registry diff functionality that Feast uses to determine if changes to the registry or infrastructure is needed. +2. Local CLI Tests and Local Feast Tests + * These tests test all of the cli commands against the local file offline store. +3. Infrastructure Unit Tests + * DynamoDB tests with dynamo mocked out + * Repository configuration tests + * Schema inference unit tests + * Key serialization tests + * Basic provider unit tests +4. Feature Store Validation Tests + * These test mainly contain class level validation like hashing tests, protobuf and class serialization, and error and warning handling. 
+ * Data source unit tests + * Feature service unit tests + * Feature service, feature view, and feature validation tests + * Protobuf/json tests for Feast ValueTypes + * Serialization tests + * Type mapping + * Feast types + * Serialization tests due to this [issue](https://github.com/feast-dev/feast/issues/2345) + * Feast usage tracking unit tests + +#### Docstring tests + +Docstring tests are primarily smoke tests to make sure imports and setup functions can be executed without errors. + +## Understanding the test suite with an example test + +### Example test + +Let's look at a sample test using the universal repo: + +{% tabs %} +{% tab code="sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py" %} +```python +@pytest.mark.integration +@pytest.mark.universal_offline_stores +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: f"full:{v}") +def test_historical_features(environment, universal_data_sources, full_feature_names): + store = environment.feature_store + + (entities, datasets, data_sources) = universal_data_sources + + feature_views = construct_universal_feature_views(data_sources) + + entity_df_with_request_data = datasets.entity_df.copy(deep=True) + entity_df_with_request_data["val_to_add"] = [ + i for i in range(len(entity_df_with_request_data)) + ] + entity_df_with_request_data["driver_age"] = [ + i + 100 for i in range(len(entity_df_with_request_data)) + ] + + feature_service = FeatureService( + name="convrate_plus100", + features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv], + ) + feature_service_entity_mapping = FeatureService( + name="entity_mapping", + features=[ + feature_views.location.with_name("origin").with_join_key_map( + {"location_id": "origin_id"} + ), + feature_views.location.with_name("destination").with_join_key_map( + {"location_id": "destination_id"} + ), + ], + ) + + store.apply( + [ + driver(), + customer(), + location(), + feature_service, + 
feature_service_entity_mapping, + *feature_views.values(), + ] + ) + # ... more test code + + job_from_df = store.get_historical_features( + entity_df=entity_df_with_request_data, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips", + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + "conv_rate_plus_100:conv_rate_plus_100", + "conv_rate_plus_100:conv_rate_plus_100_rounded", + "conv_rate_plus_100:conv_rate_plus_val_to_add", + "order:order_is_success", + "global_stats:num_rides", + "global_stats:avg_ride_length", + "field_mapping:feature_name", + ], + full_feature_names=full_feature_names, + ) + + if job_from_df.supports_remote_storage_export(): + files = job_from_df.to_remote_storage() + print(files) + assert len(files) > 0 # This test should be way more detailed + + start_time = datetime.utcnow() + actual_df_from_df_entities = job_from_df.to_df() + # ... more test code + + validate_dataframes( + expected_df, + table_from_df_entities, + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + ) + # ... more test code +``` +{% endtab %} +{% endtabs %} + +* The key fixtures are the `environment` and `universal_data_sources` fixtures, which are defined in the `feature_repos` directories and the `conftest.py` file. This by default pulls in a standard dataset with driver and customer entities (that we have pre-defined), certain feature views, and feature values. + * The `environment` fixture sets up a feature store, parametrized by the provider and the online/offline store. It allows the test to query against that feature store without needing to worry about the underlying implementation or any setup that may be involved in creating instances of these datastores. 
+ * Each fixture creates a different integration test with its own `IntegrationTestRepoConfig` which is used by pytest to generate a unique test testing one of the different environments that require testing. + +* Feast tests also use a variety of markers: + * The `@pytest.mark.integration` marker is used to designate integration tests which will cause the test to be run when you call `make test-python-integration`. + * The `@pytest.mark.universal_offline_stores` marker will parametrize the test on all of the universal offline stores including file, redshift, bigquery and snowflake. + * The `full_feature_names` parametrization defines whether or not the test should reference features as their full feature name (fully qualified path) or just the feature name itself. + + +## Writing a new test or reusing existing tests + +### To add a new test to an existing test file + +* Use the same function signatures as an existing test (e.g. use `environment` and `universal_data_sources` as an argument) to include the relevant test fixtures. +* If possible, expand an individual test instead of writing a new test, due to the cost of starting up offline / online stores. +* Use the `universal_offline_stores` and `universal_online_store` markers to parametrize the test against different offline store and online store combinations. You can also designate specific online and offline stores to test by using the `only` parameter on the marker. + +```python +@pytest.mark.universal_online_stores(only=["redis"]) +``` +### To test a new offline / online store from a plugin repo + +* Install Feast in editable mode with `pip install -e`. +* The core tests for offline / online store behavior are parametrized by the `FULL_REPO_CONFIGS` variable defined in `feature_repos/repo_configuration.py`. 
To overwrite this variable without modifying the Feast repo, create your own file that contains a `FULL_REPO_CONFIGS` (which will require adding a new `IntegrationTestRepoConfig` or two) and set the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. Then the core offline / online store tests can be run with `make test-python-universal`. +* See the [custom offline store demo](https://github.com/feast-dev/feast-custom-offline-store-demo) and the [custom online store demo](https://github.com/feast-dev/feast-custom-online-store-demo) for examples. + +### What are some important things to keep in mind when adding a new offline / online store? + +#### Type mapping/Inference + +Many problems arise when implementing your data store's type conversion to interface with Feast datatypes. +1. You will need to correctly update `inference.py` so that Feast can infer your datasource schemas +2. You also need to update `type_map.py` so that Feast knows how to convert your datastores types to Feast-recognized types in `feast/types.py`. + +#### Historical and online retrieval + +The most important functionality in Feast is historical and online retrieval. Most of the e2e and universal integration test test this functionality in some way. Making sure this functionality works also indirectly asserts that reading and writing from your datastore works as intended. + + +### To include a new offline / online store in the main Feast repo + +* Extend `data_source_creator.py` for your offline store. +* In `repo_configuration.py` add a new `IntegrationTestRepoConfig` or two (depending on how many online stores you want to test). + * Generally, you should only need to test against sqlite. However, if you need to test against a production online store, then you can also test against Redis or dynamodb. 
+* Run the full test suite with `make test-python-integration.` + +### Including a new offline / online store in the main Feast repo from external plugins with community maintainers. + +* This folder is for plugins that are officially maintained with community owners. Place the APIs in `feast/infra/offline_stores/contrib/`. +* Extend `data_source_creator.py` for your offline store and implement the required APIs. +* In `contrib_repo_configuration.py` add a new `IntegrationTestRepoConfig` (depending on how many online stores you want to test). +* Run the test suite on the contrib test suite with `make test-python-contrib-universal`. + +### To include a new online store + +* In `repo_configuration.py` add a new config that maps to a serialized version of configuration you need in `feature_store.yaml` to setup the online store. +* In `repo_configuration.py`, add new `IntegrationTestRepoConfig` for online stores you want to test. +* Run the full test suite with `make test-python-integration` + +### To use custom data in a new test + +* Check `test_universal_types.py` for an example of how to do this. + +```python +@pytest.mark.integration +def your_test(environment: Environment): + df = #...# + data_source = environment.data_source_creator.create_data_source( + df, + destination_name=environment.feature_store.project + ) + your_fv = driver_feature_view(data_source) + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) + + # ... run test +``` + +### Running your own Redis cluster for testing + +* Install Redis on your computer. If you are a mac user, you should be able to `brew install redis`. + * Running `redis-server --help` and `redis-cli --help` should show corresponding help menus. +* * Run `./infra/scripts/redis-cluster.sh start` then `./infra/scripts/redis-cluster.sh create` to start the Redis cluster locally. 
You should see output that looks like this: +~~~~ +Starting 6001 +Starting 6002 +Starting 6003 +Starting 6004 +Starting 6005 +Starting 6006 +~~~~ +* You should be able to run the integration tests and have the Redis cluster tests pass. +* If you would like to run your own Redis cluster, you can run the above commands with your own specified ports and connect to the newly configured cluster. +* To stop the cluster, run `./infra/scripts/redis-cluster.sh stop` and then `./infra/scripts/redis-cluster.sh clean`. diff --git a/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py b/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py index 03070887c4..c942232b47 100644 --- a/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py +++ b/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py @@ -1,64 +1,14 @@ -import random -from typing import List - import pytest -from feast import FeatureService -from feast.feast_object import FeastObject -from tests.integration.feature_repos.repo_configuration import ( - construct_universal_feature_views, -) -from tests.integration.feature_repos.universal.entities import ( - customer, - driver, - location, -) - @pytest.mark.benchmark @pytest.mark.integration @pytest.mark.universal_online_stores -def test_online_retrieval(environment, universal_data_sources, benchmark): - fs = environment.feature_store - entities, datasets, data_sources = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - - feature_service = FeatureService( - "convrate_plus100", - features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv], - ) - - feast_objects: List[FeastObject] = [] - feast_objects.extend(feature_views.values()) - feast_objects.extend([driver(), customer(), location(), feature_service]) - fs.apply(feast_objects) - fs.materialize(environment.start_date, environment.end_date) - - sample_drivers = 
random.sample(entities.driver_vals, 10) - - sample_customers = random.sample(entities.customer_vals, 10) - - entity_rows = [ - {"driver_id": d, "customer_id": c, "val_to_add": 50} - for (d, c) in zip(sample_drivers, sample_customers) - ] - - feature_refs = [ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - "conv_rate_plus_100:conv_rate_plus_100", - "conv_rate_plus_100:conv_rate_plus_val_to_add", - "global_stats:num_rides", - "global_stats:avg_ride_length", - ] - unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] - # Remove the on demand feature view output features, since they're not present in the source dataframe - unprefixed_feature_refs.remove("conv_rate_plus_100") - unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") - +def test_online_retrieval(feature_store_for_online_retrieval, benchmark): + """ + Benchmarks a basic online retrieval flow. 
+ """ + fs, feature_refs, entity_rows = feature_store_for_online_retrieval benchmark( fs.get_online_features, features=feature_refs, diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index b4bcccd9c6..69ff7f681c 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -14,10 +14,11 @@ import logging import multiprocessing import os +import random from datetime import datetime, timedelta from multiprocessing import Process from sys import platform -from typing import Any, Dict, List +from typing import Any, Dict, List, Tuple import pandas as pd import pytest @@ -25,7 +26,7 @@ os.environ["FEAST_USAGE"] = "False" os.environ["IS_TEST"] = "True" -from feast import FeatureStore # noqa: E402 +from feast.feature_store import FeatureStore # noqa: E402 from feast.wait import wait_retry_backoff # noqa: E402 from tests.data.data_creator import create_basic_driver_dataset # noqa: E402 from tests.integration.feature_repos.integration_test_repo_config import ( # noqa: E402 @@ -38,11 +39,17 @@ Environment, TestData, construct_test_environment, + construct_universal_feature_views, construct_universal_test_data, ) from tests.integration.feature_repos.universal.data_sources.file import ( # noqa: E402 FileDataSourceCreator, ) +from tests.integration.feature_repos.universal.entities import ( # noqa: E402 + customer, + driver, + location, +) from tests.utils.http_server import check_port_open, free_port # noqa: E402 logger = logging.getLogger(__name__) @@ -373,3 +380,44 @@ def e2e_data_sources(environment: Environment): ) return df, data_source + + +@pytest.fixture +def feature_store_for_online_retrieval( + environment, universal_data_sources +) -> Tuple[FeatureStore, List[str], List[Dict[str, int]]]: + """ + Returns a feature store that is ready for online retrieval, along with entity rows and feature + refs that can be used to query for online features. 
+ """ + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer(), location()]) + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + + sample_drivers = random.sample(entities.driver_vals, 10) + sample_customers = random.sample(entities.customer_vals, 10) + + entity_rows = [ + {"driver_id": d, "customer_id": c, "val_to_add": 50} + for (d, c) in zip(sample_drivers, sample_customers) + ] + + feature_refs = [ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips", + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + "conv_rate_plus_100:conv_rate_plus_100", + "conv_rate_plus_100:conv_rate_plus_val_to_add", + "global_stats:num_rides", + "global_stats:avg_ride_length", + ] + + return fs, feature_refs, entity_rows diff --git a/sdk/python/tests/doctest/__init__.py b/sdk/python/tests/doctest/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/doctest/test_all.py b/sdk/python/tests/doctest/test_all.py index 0412e34c36..814a7ca798 100644 --- a/sdk/python/tests/doctest/test_all.py +++ b/sdk/python/tests/doctest/test_all.py @@ -2,10 +2,13 @@ import importlib import pkgutil import sys +import traceback import unittest import feast +FILES_TO_IGNORE = {"app"} + def setup_feature_store(): """Prepares the local environment for a FeatureStore docstring test.""" @@ -15,14 +18,14 @@ def setup_feature_store(): from feast.repo_operations import init_repo from feast.types import Float32, Int64 - init_repo("feature_repo", "local") - fs = FeatureStore(repo_path="feature_repo") + init_repo("project", "local") + fs = FeatureStore(repo_path="project/feature_repo") driver = Entity( name="driver_id", description="driver id", ) 
driver_hourly_stats = FileSource( - path="feature_repo/data/driver_stats.parquet", + path="project/feature_repo/data/driver_stats.parquet", timestamp_field="event_timestamp", created_timestamp_column="created", ) @@ -35,7 +38,7 @@ def setup_feature_store(): Field(name="acc_rate", dtype=Float32), Field(name="avg_daily_trips", dtype=Int64), ], - batch_source=driver_hourly_stats, + source=driver_hourly_stats, ) fs.apply([driver_hourly_stats_view, driver]) fs.materialize( @@ -48,7 +51,7 @@ def teardown_feature_store(): """Cleans up the local environment after a FeatureStore docstring test.""" import shutil - shutil.rmtree("feature_repo", ignore_errors=True) + shutil.rmtree("project", ignore_errors=True) def test_docstrings(): @@ -68,8 +71,10 @@ def test_docstrings(): for package in current_packages: for _, name, is_pkg in pkgutil.walk_packages(package.__path__): - full_name = package.__name__ + "." + name + if name in FILES_TO_IGNORE: + continue + full_name = package.__name__ + "." + name try: temp_module = importlib.import_module(full_name) if is_pkg: @@ -101,7 +106,7 @@ def test_docstrings(): failed_cases.append(result.failures) except Exception as e: successful = False - failed_cases.append((full_name, e)) + failed_cases.append((full_name, str(e) + traceback.format_exc())) finally: if teardown_function: teardown_function() diff --git a/sdk/python/tests/example_repos/example_feature_repo_1.py b/sdk/python/tests/example_repos/example_feature_repo_1.py index 200065f0b1..eca9aee57c 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_1.py +++ b/sdk/python/tests/example_repos/example_feature_repo_1.py @@ -51,7 +51,7 @@ Field(name="driver_id", dtype=Int64), ], online=True, - batch_source=driver_locations_source, + source=driver_locations_source, tags={}, ) @@ -65,7 +65,7 @@ Field(name="driver_id", dtype=Int64), ], online=True, - stream_source=driver_locations_push_source, + source=driver_locations_push_source, tags={}, ) @@ -80,7 +80,7 @@ 
Field(name="customer_id", dtype=String), ], online=True, - batch_source=customer_profile_source, + source=customer_profile_source, tags={}, ) @@ -94,7 +94,7 @@ Field(name="customer_id", dtype=String), ], online=True, - batch_source=customer_driver_combined_source, + source=customer_driver_combined_source, tags={}, ) diff --git a/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py b/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py deleted file mode 100644 index 68681794f9..0000000000 --- a/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py +++ /dev/null @@ -1,81 +0,0 @@ -from datetime import timedelta - -import pandas as pd - -from feast import Entity, Feature, FeatureView, FileSource, ValueType -from feast.data_source import RequestDataSource -from feast.on_demand_feature_view import on_demand_feature_view - -driver_hourly_stats = FileSource( - path="%PARQUET_PATH%", # placeholder to be replaced by the test - event_timestamp_column="event_timestamp", # Changed to `timestamp_field` in 0.20 - created_timestamp_column="created", -) - -driver = Entity( - name="driver_id", - value_type=ValueType.INT64, - description="driver id", - join_key="driver_id", # Changed to `join_keys` in 0.20 -) - - -driver_hourly_stats_view = FeatureView( - name="driver_hourly_stats", - entities=["driver_id"], - ttl=timedelta(days=1), - features=[ # Changed to `schema` in 0.20 - Feature(name="conv_rate", dtype=ValueType.FLOAT), # Changed to `Field` in 0.20 - Feature(name="acc_rate", dtype=ValueType.FLOAT), - Feature(name="avg_daily_trips", dtype=ValueType.INT64), - ], - online=True, - batch_source=driver_hourly_stats, # Changed to `source` in 0.20 - tags={}, -) - - -global_daily_stats = FileSource( - path="%PARQUET_PATH_GLOBAL%", # placeholder to be replaced by the test - event_timestamp_column="event_timestamp", # Changed to `timestamp_field` in 0.20 - created_timestamp_column="created", -) - - -global_stats_feature_view = FeatureView( - 
name="global_daily_stats", - entities=[], - ttl=timedelta(days=1), - features=[ # Changed to `schema` in 0.20 - Feature(name="num_rides", dtype=ValueType.INT32), # Changed to `Field` in 0.20 - Feature(name="avg_ride_length", dtype=ValueType.FLOAT), - ], - online=True, - batch_source=global_daily_stats, # Changed to `source` in 0.20 - tags={}, -) - - -request_source = RequestDataSource( - name="conv_rate_input", - schema={"val_to_add": ValueType.INT64}, -) - - -@on_demand_feature_view( - inputs={ - "conv_rate_input": request_source, - "driver_hourly_stats": driver_hourly_stats_view, - }, - features=[ - Feature(name="conv_rate_plus_100", dtype=ValueType.DOUBLE), - Feature(name="conv_rate_plus_val_to_add", dtype=ValueType.DOUBLE), - ], -) -def conv_rate_plus_100(features_df: pd.DataFrame) -> pd.DataFrame: - df = pd.DataFrame() - df["conv_rate_plus_100"] = features_df["conv_rate"] + 100 - df["conv_rate_plus_val_to_add"] = ( - features_df["conv_rate"] + features_df["val_to_add"] - ) - return df diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_bfvs.py b/sdk/python/tests/example_repos/example_feature_repo_with_bfvs.py new file mode 100644 index 0000000000..e0f75c0c6f --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_bfvs.py @@ -0,0 +1,52 @@ +from datetime import timedelta + +from feast import BatchFeatureView, Entity, Field, FileSource +from feast.types import Float32, Int32, Int64 + +driver_hourly_stats = FileSource( + path="%PARQUET_PATH%", # placeholder to be replaced by the test + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +driver = Entity( + name="driver_id", + description="driver id", +) + + +driver_hourly_stats_view = BatchFeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + Field(name="driver_id", 
dtype=Int32), + ], + online=True, + source=driver_hourly_stats, + tags={}, +) + + +global_daily_stats = FileSource( + path="%PARQUET_PATH_GLOBAL%", # placeholder to be replaced by the test + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + + +global_stats_feature_view = BatchFeatureView( + name="global_daily_stats", + entities=None, + ttl=timedelta(days=1), + schema=[ + Field(name="num_rides", dtype=Int32), + Field(name="avg_ride_length", dtype=Float32), + ], + online=True, + source=global_daily_stats, + tags={}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py index 372bd9afb7..ff9a0c85e9 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py +++ b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py @@ -25,7 +25,7 @@ Field(name="driver_id", dtype=Int64), ], online=True, - batch_source=driver_locations_source, + source=driver_locations_source, tags={}, ) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_2.py b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_2.py new file mode 100644 index 0000000000..3547c3de86 --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_2.py @@ -0,0 +1,63 @@ +from datetime import timedelta + +from feast import Entity, FeatureService, FeatureView, Field, FileSource +from feast.types import Float32, Int32, Int64 + +driver_hourly_stats = FileSource( + path="data/driver_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +driver = Entity( + name="driver_id", +) + +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + 
Field(name="avg_daily_trips", dtype=Int64), + Field(name="driver_id", dtype=Int32), + ], + online=True, + source=driver_hourly_stats, + tags={}, +) + +global_daily_stats = FileSource( + path="data/global_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +global_stats_feature_view = FeatureView( + name="global_daily_stats", + entities=[], + ttl=timedelta(days=1), + schema=[ + Field(name="num_rides", dtype=Int32), + Field(name="avg_ride_length", dtype=Float32), + ], + online=True, + source=global_daily_stats, + tags={}, +) + +all_stats_service = FeatureService( + name="all_stats", + features=[driver_hourly_stats_view, global_stats_feature_view], + tags={"release": "production"}, +) + +some_stats_service = FeatureService( + name="some_stats", + features=[ + driver_hourly_stats_view[["conv_rate"]], + global_stats_feature_view[["num_rides"]], + ], + tags={"release": "production"}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_3.py b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_3.py new file mode 100644 index 0000000000..c16a5d4abc --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service_3.py @@ -0,0 +1,52 @@ +from datetime import timedelta + +from feast import Entity, FeatureService, FeatureView, FileSource + +driver_hourly_stats = FileSource( + path="%PARQUET_PATH%", # placeholder to be replaced by the test + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +driver = Entity( + name="driver_id", +) + +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + online=True, + source=driver_hourly_stats, + tags={}, +) + +global_daily_stats = FileSource( + path="%PARQUET_PATH_GLOBAL%", # placeholder to be replaced by the test + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + 
+global_stats_feature_view = FeatureView( + name="global_daily_stats", + entities=[], + ttl=timedelta(days=1), + online=True, + source=global_daily_stats, + tags={}, +) + +all_stats_service = FeatureService( + name="all_stats", + features=[driver_hourly_stats_view, global_stats_feature_view], + tags={"release": "production"}, +) + +some_stats_service = FeatureService( + name="some_stats", + features=[ + driver_hourly_stats_view[["conv_rate"]], + global_stats_feature_view[["num_rides"]], + ], + tags={"release": "production"}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_inline_batch_source.py b/sdk/python/tests/example_repos/example_feature_repo_with_inline_batch_source.py new file mode 100644 index 0000000000..dc79d28195 --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_inline_batch_source.py @@ -0,0 +1,28 @@ +from datetime import timedelta + +from feast import Entity, FeatureView, Field, FileSource +from feast.types import Float32, Int32, Int64 + +driver = Entity( + name="driver_id", + description="driver id", +) + +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + Field(name="driver_id", dtype=Int32), + ], + online=True, + source=FileSource( + path="data/driver_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", + ), + tags={}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_inline_stream_source.py b/sdk/python/tests/example_repos/example_feature_repo_with_inline_stream_source.py new file mode 100644 index 0000000000..5d01791b73 --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_inline_stream_source.py @@ -0,0 +1,37 @@ +from datetime import timedelta + +from feast import Entity, FeatureView, Field, 
FileSource, KafkaSource +from feast.data_format import AvroFormat +from feast.types import Float32, Int32, Int64 + +driver = Entity( + name="driver_id", + description="driver id", +) + +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + Field(name="driver_id", dtype=Int32), + ], + online=True, + source=KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=FileSource( + path="data/driver_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", + ), + watermark_delay_threshold=timedelta(days=1), + ), + tags={}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_stream_source.py b/sdk/python/tests/example_repos/example_feature_repo_with_stream_source.py new file mode 100644 index 0000000000..0672e3552c --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_stream_source.py @@ -0,0 +1,18 @@ +from datetime import timedelta + +from feast import FileSource, KafkaSource +from feast.data_format import AvroFormat + +stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=FileSource( + path="data/driver_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", + ), + watermark_delay_threshold=timedelta(days=1), +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_ttl_0.py b/sdk/python/tests/example_repos/example_feature_repo_with_ttl_0.py index e2bec03f8f..87ee57ee51 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_with_ttl_0.py +++ 
b/sdk/python/tests/example_repos/example_feature_repo_with_ttl_0.py @@ -1,6 +1,6 @@ from datetime import timedelta -from feast import Entity, FeatureView, Field, FileSource, ValueType +from feast import Entity, FeatureView, Field, FileSource from feast.types import Float32, Int32, Int64 driver_hourly_stats = FileSource( @@ -9,7 +9,7 @@ created_timestamp_column="created", ) -driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id") +driver = Entity(name="driver_id", description="driver id") driver_hourly_stats_view = FeatureView( diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index 7866465b91..d27e2645d4 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -9,9 +9,9 @@ from feast import Entity, FeatureService, FeatureView, RepoConfig from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.provider import Provider +from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.registry import BaseRegistry from feast.saved_dataset import SavedDataset diff --git a/sdk/python/tests/foo_registry_store.py b/sdk/python/tests/foo_registry_store.py index 31fb653e9b..a537ab344b 100644 --- a/sdk/python/tests/foo_registry_store.py +++ b/sdk/python/tests/foo_registry_store.py @@ -1,7 +1,7 @@ from pathlib import Path +from feast.infra.registry.registry_store import RegistryStore from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto -from feast.registry_store import RegistryStore from feast.repo_config import RegistryConfig diff --git a/sdk/python/tests/integration/e2e/__init__.py b/sdk/python/tests/integration/e2e/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/integration/e2e/test_usage_e2e.py 
b/sdk/python/tests/integration/e2e/test_usage_e2e.py index 5c95bd50b1..4c8be46890 100644 --- a/sdk/python/tests/integration/e2e/test_usage_e2e.py +++ b/sdk/python/tests/integration/e2e/test_usage_e2e.py @@ -57,6 +57,7 @@ def test_usage_on(dummy_exporter, enabling_toggle): online_store=SqliteOnlineStoreConfig( path=os.path.join(temp_dir, "online.db") ), + entity_key_serialization_version=2, ) ) entity = Entity( @@ -69,10 +70,10 @@ def test_usage_on(dummy_exporter, enabling_toggle): assert len(dummy_exporter) == 3 assert { - "entrypoint": "feast.infra.local.LocalRegistryStore.get_registry_proto" + "entrypoint": "feast.infra.registry.file.FileRegistryStore.get_registry_proto" }.items() <= dummy_exporter[0].items() assert { - "entrypoint": "feast.infra.local.LocalRegistryStore.update_registry_proto" + "entrypoint": "feast.infra.registry.file.FileRegistryStore.update_registry_proto" }.items() <= dummy_exporter[1].items() assert { "entrypoint": "feast.feature_store.FeatureStore.apply" @@ -95,6 +96,7 @@ def test_usage_off(dummy_exporter, enabling_toggle): online_store=SqliteOnlineStoreConfig( path=os.path.join(temp_dir, "online.db") ), + entity_key_serialization_version=2, ) ) entity = Entity( @@ -138,7 +140,7 @@ def test_exception_usage_off(dummy_exporter, enabling_toggle): def _reload_feast(): """After changing environment need to reload modules and rerun usage decorators""" modules = ( - "feast.infra.local", + "feast.infra.registry.file", "feast.infra.online_stores.sqlite", "feast.feature_store", ) diff --git a/sdk/python/tests/integration/e2e/test_validation.py b/sdk/python/tests/integration/e2e/test_validation.py index 7062948f53..771061b206 100644 --- a/sdk/python/tests/integration/e2e/test_validation.py +++ b/sdk/python/tests/integration/e2e/test_validation.py @@ -65,6 +65,7 @@ def test_historical_retrieval_with_validation(environment, universal_data_source from_=reference_job, name="my_training_dataset", 
storage=environment.data_source_creator.create_saved_dataset_destination(), + allow_overwrite=True, ) saved_dataset = store.get_saved_dataset("my_training_dataset") @@ -95,6 +96,7 @@ def test_historical_retrieval_fails_on_validation(environment, universal_data_so from_=reference_job, name="my_other_dataset", storage=environment.data_source_creator.create_saved_dataset_destination(), + allow_overwrite=True, ) job = store.get_historical_features( @@ -150,25 +152,33 @@ def test_logged_features_validation(environment, universal_data_sources): # add some non-existing entities to check NotFound feature handling for i in range(5): - entity_df = entity_df.append( - { - "customer_id": 2000 + i, - "driver_id": 6000 + i, - "event_timestamp": datetime.datetime.now(), - }, - ignore_index=True, + entity_df = pd.concat( + [ + entity_df, + pd.DataFrame.from_records( + [ + { + "customer_id": 2000 + i, + "driver_id": 6000 + i, + "event_timestamp": datetime.datetime.now(), + } + ] + ), + ] ) + store_fs = store.get_feature_service(feature_service.name) reference_dataset = store.create_saved_dataset( from_=store.get_historical_features( - entity_df=entity_df, features=feature_service, full_feature_names=True + entity_df=entity_df, features=store_fs, full_feature_names=True ), name="reference_for_validating_logged_features", storage=environment.data_source_creator.create_saved_dataset_destination(), + allow_overwrite=True, ) log_source_df = store.get_historical_features( - entity_df=entity_df, features=feature_service, full_feature_names=False + entity_df=entity_df, features=store_fs, full_feature_names=False ).to_df() logs_df = prepare_logs(log_source_df, feature_service, store) @@ -229,13 +239,16 @@ def test_e2e_validation_via_cli(environment, universal_data_sources): columns=["order_id", "origin_id", "destination_id", "driver_id"] ) retrieval_job = store.get_historical_features( - entity_df=entity_df, features=feature_service, full_feature_names=True + entity_df=entity_df, + 
features=store.get_feature_service(feature_service.name), + full_feature_names=True, ) logs_df = prepare_logs(retrieval_job.to_df(), feature_service, store) saved_dataset = store.create_saved_dataset( from_=retrieval_job, name="reference_for_validating_logged_features", storage=environment.data_source_creator.create_saved_dataset_destination(), + allow_overwrite=True, ) reference = saved_dataset.as_reference( name="test_reference", profiler=configurable_profiler diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 776fff3bb9..708d9c0a14 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -75,11 +75,11 @@ SNOWFLAKE_CONFIG = { "type": "snowflake.online", - "account": os.environ.get("SNOWFLAKE_CI_DEPLOYMENT", ""), - "user": os.environ.get("SNOWFLAKE_CI_USER", ""), - "password": os.environ.get("SNOWFLAKE_CI_PASSWORD", ""), - "role": os.environ.get("SNOWFLAKE_CI_ROLE", ""), - "warehouse": os.environ.get("SNOWFLAKE_CI_WAREHOUSE", ""), + "account": os.getenv("SNOWFLAKE_CI_DEPLOYMENT", ""), + "user": os.getenv("SNOWFLAKE_CI_USER", ""), + "password": os.getenv("SNOWFLAKE_CI_PASSWORD", ""), + "role": os.getenv("SNOWFLAKE_CI_ROLE", ""), + "warehouse": os.getenv("SNOWFLAKE_CI_WAREHOUSE", ""), "database": "FEAST", "schema": "ONLINE", } @@ -349,6 +349,7 @@ class Environment: python_feature_server: bool worker_id: str online_store_creator: Optional[OnlineStoreCreator] = None + fixture_request: Optional[pytest.FixtureRequest] = None def __post_init__(self): self.end_date = datetime.utcnow().replace(microsecond=0, second=0, minute=0) @@ -368,6 +369,7 @@ def construct_test_environment( fixture_request: Optional[pytest.FixtureRequest], test_suite_name: str = "integration_test", worker_id: str = "worker_id", + entity_key_serialization_version: int = 2, ) -> Environment: _uuid = 
str(uuid.uuid4()).replace("-", "")[:6] @@ -437,6 +439,7 @@ def construct_test_environment( repo_path=repo_dir_name, feature_server=feature_server, go_feature_serving=test_repo_config.go_feature_serving, + entity_key_serialization_version=entity_key_serialization_version, ) # Create feature_store.yaml out of the config @@ -455,6 +458,7 @@ def construct_test_environment( python_feature_server=test_repo_config.python_feature_server, worker_id=worker_id, online_store_creator=online_creator, + fixture_request=fixture_request, ) return environment diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py index ae83ea8eb0..f0a09b4d5b 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py @@ -12,7 +12,11 @@ SavedDatasetSnowflakeStorage, SnowflakeLoggingDestination, ) -from feast.infra.utils.snowflake_utils import get_snowflake_conn, write_pandas +from feast.infra.utils.snowflake.snowflake_utils import ( + execute_snowflake_statement, + get_snowflake_conn, + write_pandas, +) from feast.repo_config import FeastConfigBaseModel from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, @@ -34,8 +38,10 @@ def __init__(self, project_name: str, *args, **kwargs): warehouse=os.environ["SNOWFLAKE_CI_WAREHOUSE"], database="FEAST", schema="OFFLINE", - storage_integration_name="FEAST_S3", - blob_export_location="s3://feast-snowflake-offload/export", + storage_integration_name=os.getenv("BLOB_EXPORT_STORAGE_NAME", "FEAST_S3"), + blob_export_location=os.getenv( + "BLOB_EXPORT_URI", "s3://feast-snowflake-offload/export" + ), ) def create_data_source( @@ -87,9 +93,7 @@ def get_prefixed_table_name(self, suffix: str) -> str: return f"{self.project_name}_{suffix}" def teardown(self): - snowflake_conn = 
get_snowflake_conn(self.offline_store_config) - - with snowflake_conn as conn: - cur = conn.cursor() + with get_snowflake_conn(self.offline_store_config) as conn: for table in self.tables: - cur.execute(f'DROP TABLE IF EXISTS "{table}"') + query = f'DROP TABLE IF EXISTS "{table}"' + execute_snowflake_statement(conn, query) diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index b6e9aa8fc0..4eece13412 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -12,7 +12,6 @@ OnDemandFeatureView, PushSource, StreamFeatureView, - ValueType, ) from feast.data_source import DataSource, RequestSource from feast.types import Array, FeastType, Float32, Float64, Int32, Int64 @@ -42,24 +41,6 @@ def driver_feature_view( ) -def global_feature_view( - data_source: DataSource, - name="test_entityless", - infer_features: bool = False, - value_type: ValueType = ValueType.INT32, -) -> FeatureView: - return FeatureView( - name=name, - entities=[], - # Test that Features still work for FeatureViews. 
- features=None - if infer_features - else [Feature(name="entityless_value", dtype=value_type)], - ttl=timedelta(days=5), - source=data_source, - ) - - def conv_rate_plus_100(features_df: pd.DataFrame) -> pd.DataFrame: df = pd.DataFrame() df["conv_rate_plus_100"] = features_df["conv_rate"] + 100 @@ -88,6 +69,7 @@ def conv_rate_plus_100_feature_view( schema=[] if infer_features else _features, sources=sources, udf=conv_rate_plus_100, + udf_string="raw udf source", ) @@ -125,6 +107,7 @@ def similarity_feature_view( sources=sources, schema=[] if infer_features else _fields, udf=similarity, + udf_string="similarity raw udf", ) @@ -138,10 +121,10 @@ def create_conv_rate_request_source(): def create_similarity_request_source(): return RequestSource( name="similarity_input", - schema={ - "vector_double": ValueType.DOUBLE_LIST, - "vector_float": ValueType.FLOAT_LIST, - }, + schema=[ + Field(name="vector_doube", dtype=Array(Float64)), + Field(name="vector_float", dtype=Array(Float32)), + ], ) @@ -155,7 +138,7 @@ def create_item_embeddings_feature_view(source, infer_features: bool = False): Field(name="embedding_double", dtype=Array(Float64)), Field(name="embedding_float", dtype=Array(Float32)), ], - batch_source=source, + source=source, ttl=timedelta(hours=2), ) return item_embeddings_feature_view @@ -240,12 +223,11 @@ def create_global_stats_feature_view(source, infer_features: bool = False): global_stats_feature_view = FeatureView( name="global_stats", entities=[], - features=None + schema=None if infer_features else [ - # Test that Features still work for FeatureViews. 
- Feature(name="num_rides", dtype=ValueType.INT32), - Feature(name="avg_ride_length", dtype=ValueType.FLOAT), + Field(name="num_rides", dtype=Int32), + Field(name="avg_ride_length", dtype=Float32), ], source=source, ttl=timedelta(days=2), @@ -288,8 +270,7 @@ def create_field_mapping_feature_view(source): return FeatureView( name="field_mapping", entities=[], - # Test that Features still work for FeatureViews. - features=[Feature(name="feature_name", dtype=ValueType.INT32)], + schema=[Field(name="feature_name", dtype=Int32)], source=source, ttl=timedelta(days=2), ) diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py new file mode 100644 index 0000000000..190d94a830 --- /dev/null +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py @@ -0,0 +1,56 @@ +# +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import time +from typing import Dict + +from testcontainers.core.container import DockerContainer +from testcontainers.core.waiting_utils import wait_for_logs + +from tests.integration.feature_repos.universal.online_store_creator import ( + OnlineStoreCreator, +) + + +class CassandraOnlineStoreCreator(OnlineStoreCreator): + def __init__(self, project_name: str, **kwargs): + super().__init__(project_name) + self.container = DockerContainer("library/cassandra:4.0.4").with_exposed_ports( + "9042" + ) + + def create_online_store(self) -> Dict[str, object]: + self.container.start() + log_string_to_wait_for = "Startup complete" + # on a modern machine it takes about 45-60 seconds for the container + # to start accepting CQL requests: + wait_for_logs( + container=self.container, predicate=log_string_to_wait_for, timeout=90 + ) + keyspace_name = "feast_keyspace" + keyspace_creation_command = f"create KEYSPACE \"{keyspace_name}\" WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}};" + self.container.exec(f'cqlsh -e "{keyspace_creation_command}"') + time.sleep(2) + exposed_port = int(self.container.get_exposed_port("9042")) + return { + "type": "cassandra", + "hosts": ["127.0.0.1"], + "port": exposed_port, + "keyspace": keyspace_name, + } + + def teardown(self): + self.container.stop() diff --git a/sdk/python/tests/integration/materialization/__init__.py b/sdk/python/tests/integration/materialization/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/integration/materialization/contrib/bytewax/README.md b/sdk/python/tests/integration/materialization/contrib/bytewax/README.md new file mode 100644 index 0000000000..4ed5d49a68 --- /dev/null +++ b/sdk/python/tests/integration/materialization/contrib/bytewax/README.md @@ -0,0 +1,22 @@ +# Running Bytewax integration tests + +To run the Bytewax integration tests, you'll need to provision a cluster using 
[eksctl](https://docs.aws.amazon.com/eks/latest/userguide/eksctl.html). + +## Creating an EKS cluster + +In this directory is a configuration file for a single-node EKS cluster. + +To create the EKS cluster needed for testing, issue the following command: + +``` shell +> eksctl create cluster -f ./eks-config.yaml +``` + +When the tests are complete, delete the created cluster with: + +``` shell +> eksctl delete cluster bytewax-feast-cluster +``` + + + diff --git a/sdk/python/tests/integration/materialization/contrib/bytewax/eks-config.yaml b/sdk/python/tests/integration/materialization/contrib/bytewax/eks-config.yaml new file mode 100644 index 0000000000..5f8d0655aa --- /dev/null +++ b/sdk/python/tests/integration/materialization/contrib/bytewax/eks-config.yaml @@ -0,0 +1,13 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: bytewax-feast-cluster + version: "1.22" + region: us-west-2 + +managedNodeGroups: +- name: ng-1 + instanceType: c6a.large + desiredCapacity: 1 + privateNetworking: true diff --git a/sdk/python/tests/integration/materialization/contrib/bytewax/test_bytewax.py b/sdk/python/tests/integration/materialization/contrib/bytewax/test_bytewax.py new file mode 100644 index 0000000000..0d2cecb2f1 --- /dev/null +++ b/sdk/python/tests/integration/materialization/contrib/bytewax/test_bytewax.py @@ -0,0 +1,67 @@ +from datetime import timedelta + +import pytest + +from feast import Entity, Feature, FeatureView, ValueType +from tests.data.data_creator import create_basic_driver_dataset +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, + RegistryLocation, +) +from tests.integration.feature_repos.repo_configuration import ( + construct_test_environment, +) +from tests.integration.feature_repos.universal.data_sources.redshift import ( + RedshiftDataSourceCreator, +) +from tests.utils.e2e_test_validation import validate_offline_online_store_consistency + + +@pytest.mark.integration 
+@pytest.mark.skip(reason="Run this test manually after creating an EKS cluster.") +def test_bytewax_materialization(): + bytewax_config = IntegrationTestRepoConfig( + provider="aws", + online_store={"type": "dynamodb", "region": "us-west-2"}, + offline_store_creator=RedshiftDataSourceCreator, + batch_engine={ + "type": "bytewax", + }, + registry_location=RegistryLocation.S3, + ) + bytewax_environment = construct_test_environment(bytewax_config, None) + + df = create_basic_driver_dataset() + ds = bytewax_environment.data_source_creator.create_data_source( + df, + bytewax_environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + + fs = bytewax_environment.feature_store + driver = Entity( + name="driver_id", + join_key="driver_id", + value_type=ValueType.INT64, + ) + + driver_stats_fv = FeatureView( + name="driver_hourly_stats", + entities=["driver_id"], + ttl=timedelta(weeks=52), + features=[Feature(name="value", dtype=ValueType.FLOAT)], + batch_source=ds, + ) + + try: + fs.apply([driver, driver_stats_fv]) + + # materialization is run in two steps and + # we use timestamp from generated dataframe as a split point + split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1) + + print(f"Split datetime: {split_dt}") + + validate_offline_online_store_consistency(fs, driver_stats_fv, split_dt) + finally: + fs.teardown() diff --git a/sdk/python/tests/integration/materialization/test_lambda.py b/sdk/python/tests/integration/materialization/test_lambda.py index 8ffd31e0cd..d93508c156 100644 --- a/sdk/python/tests/integration/materialization/test_lambda.py +++ b/sdk/python/tests/integration/materialization/test_lambda.py @@ -3,9 +3,9 @@ import pytest from feast.entity import Entity -from feast.feature import Feature from feast.feature_view import FeatureView -from feast.types import ValueType +from feast.field import Field +from feast.types import Float32 from tests.data.data_creator import create_basic_driver_dataset from 
tests.integration.feature_repos.integration_test_repo_config import ( IntegrationTestRepoConfig, @@ -24,16 +24,23 @@ def test_lambda_materialization_consistency(): lambda_config = IntegrationTestRepoConfig( provider="aws", - online_store={"type": "dynamodb", "region": "us-west-2"}, + online_store={ + "type": "dynamodb", + "region": "us-west-2", + "consistent_reads": True, + }, offline_store_creator=RedshiftDataSourceCreator, batch_engine={ "type": "lambda", - "materialization_image": "402087665549.dkr.ecr.us-west-2.amazonaws.com/feast-lambda-consumer:v1", + "materialization_image": "402087665549.dkr.ecr.us-west-2.amazonaws.com/feast-lambda-consumer:v2", "lambda_role": "arn:aws:iam::402087665549:role/lambda_execution_role", }, registry_location=RegistryLocation.S3, ) - lambda_environment = construct_test_environment(lambda_config, None) + # TODO(adchia): figure out why entity_key_serialization_version 2 breaks with this test + lambda_environment = construct_test_environment( + lambda_config, None, entity_key_serialization_version=1 + ) df = create_basic_driver_dataset() ds = lambda_environment.data_source_creator.create_data_source( @@ -45,16 +52,15 @@ def test_lambda_materialization_consistency(): fs = lambda_environment.feature_store driver = Entity( name="driver_id", - join_key="driver_id", - value_type=ValueType.INT64, + join_keys=["driver_id"], ) driver_stats_fv = FeatureView( name="driver_hourly_stats", - entities=["driver_id"], + entities=[driver], ttl=timedelta(weeks=52), - features=[Feature(name="value", dtype=ValueType.FLOAT)], - batch_source=ds, + schema=[Field(name="value", dtype=Float32)], + source=ds, ) try: diff --git a/sdk/python/tests/integration/materialization/test_snowflake.py b/sdk/python/tests/integration/materialization/test_snowflake.py new file mode 100644 index 0000000000..0cf1471dfe --- /dev/null +++ b/sdk/python/tests/integration/materialization/test_snowflake.py @@ -0,0 +1,127 @@ +import os +from datetime import timedelta + +import 
pytest + +from feast.entity import Entity +from feast.feature_view import FeatureView +from tests.data.data_creator import create_basic_driver_dataset +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, +) +from tests.integration.feature_repos.repo_configuration import ( + construct_test_environment, +) +from tests.integration.feature_repos.universal.data_sources.snowflake import ( + SnowflakeDataSourceCreator, +) +from tests.utils.e2e_test_validation import validate_offline_online_store_consistency + +SNOWFLAKE_ENGINE_CONFIG = { + "type": "snowflake.engine", + "account": os.getenv("SNOWFLAKE_CI_DEPLOYMENT", ""), + "user": os.getenv("SNOWFLAKE_CI_USER", ""), + "password": os.getenv("SNOWFLAKE_CI_PASSWORD", ""), + "role": os.getenv("SNOWFLAKE_CI_ROLE", ""), + "warehouse": os.getenv("SNOWFLAKE_CI_WAREHOUSE", ""), + "database": "FEAST", + "schema": "MATERIALIZATION", +} + +SNOWFLAKE_ONLINE_CONFIG = { + "type": "snowflake.online", + "account": os.getenv("SNOWFLAKE_CI_DEPLOYMENT", ""), + "user": os.getenv("SNOWFLAKE_CI_USER", ""), + "password": os.getenv("SNOWFLAKE_CI_PASSWORD", ""), + "role": os.getenv("SNOWFLAKE_CI_ROLE", ""), + "warehouse": os.getenv("SNOWFLAKE_CI_WAREHOUSE", ""), + "database": "FEAST", + "schema": "ONLINE", +} + + +@pytest.mark.integration +def test_snowflake_materialization_consistency_internal(): + snowflake_config = IntegrationTestRepoConfig( + online_store=SNOWFLAKE_ONLINE_CONFIG, + offline_store_creator=SnowflakeDataSourceCreator, + batch_engine=SNOWFLAKE_ENGINE_CONFIG, + ) + snowflake_environment = construct_test_environment(snowflake_config, None) + + df = create_basic_driver_dataset() + ds = snowflake_environment.data_source_creator.create_data_source( + df, + snowflake_environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + + fs = snowflake_environment.feature_store + driver = Entity( + name="driver_id", + join_keys=["driver_id"], + ) + + driver_stats_fv = FeatureView( + 
name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(weeks=52), + source=ds, + ) + + try: + fs.apply([driver, driver_stats_fv]) + + # materialization is run in two steps and + # we use timestamp from generated dataframe as a split point + split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1) + + print(f"Split datetime: {split_dt}") + + validate_offline_online_store_consistency(fs, driver_stats_fv, split_dt) + finally: + fs.teardown() + snowflake_environment.data_source_creator.teardown() + + +@pytest.mark.integration +def test_snowflake_materialization_consistency_external(): + snowflake_config = IntegrationTestRepoConfig( + offline_store_creator=SnowflakeDataSourceCreator, + batch_engine=SNOWFLAKE_ENGINE_CONFIG, + ) + snowflake_environment = construct_test_environment(snowflake_config, None) + + df = create_basic_driver_dataset() + ds = snowflake_environment.data_source_creator.create_data_source( + df, + snowflake_environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + + fs = snowflake_environment.feature_store + driver = Entity( + name="driver_id", + join_keys=["driver_id"], + ) + + driver_stats_fv = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(weeks=52), + source=ds, + ) + + try: + fs.apply([driver, driver_stats_fv]) + + # materialization is run in two steps and + # we use timestamp from generated dataframe as a split point + split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1) + + print(f"Split datetime: {split_dt}") + + validate_offline_online_store_consistency(fs, driver_stats_fv, split_dt) + finally: + fs.teardown() + snowflake_environment.data_source_creator.teardown() diff --git a/sdk/python/tests/integration/offline_store/__init__.py b/sdk/python/tests/integration/offline_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/integration/offline_store/test_persist.py b/sdk/python/tests/integration/offline_store/test_persist.py 
new file mode 100644 index 0000000000..8e6f182917 --- /dev/null +++ b/sdk/python/tests/integration/offline_store/test_persist.py @@ -0,0 +1,54 @@ +import pytest + +from feast.errors import SavedDatasetLocationAlreadyExists +from feast.saved_dataset import SavedDatasetStorage +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, +) +from tests.integration.feature_repos.universal.entities import ( + customer, + driver, + location, +) + + +@pytest.mark.integration +@pytest.mark.universal_offline_stores(only=["file"]) +def test_persist_does_not_overwrite(environment, universal_data_sources): + """ + Tests that the persist method does not overwrite an existing location in the offline store. + + This test currently is only run against the file offline store as it is the only implementation + that prevents overwriting. As more offline stores add this check, they should be added to this test. + """ + store = environment.feature_store + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + store.apply([driver(), customer(), location(), *feature_views.values()]) + + features = [ + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + ] + + entity_df = datasets.entity_df.drop( + columns=["order_id", "origin_id", "destination_id"] + ) + job = store.get_historical_features( + entity_df=entity_df, + features=features, + ) + + with pytest.raises(SavedDatasetLocationAlreadyExists): + # Copy data source destination to a saved dataset destination. + saved_dataset_destination = SavedDatasetStorage.from_data_source( + data_sources.customer + ) + + # This should fail since persisting to a preexisting location is not allowed. 
+ store.create_saved_dataset( + from_=job, + name="my_training_dataset", + storage=saved_dataset_destination, + ) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 718b7577d9..0abb290563 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -146,7 +146,9 @@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) assert_feature_service_correctness( @@ -170,7 +172,9 @@ def test_historical_features(environment, universal_data_sources, full_feature_n validate_dataframes( expected_df, table_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -185,15 +189,16 @@ def test_historical_features_with_shared_batch_source( store = environment.feature_store entities, datasets, data_sources = universal_data_sources + driver_entity = driver() driver_stats_v1 = FeatureView( name="driver_stats_v1", - entities=["driver"], + entities=[driver_entity], schema=[Field(name="avg_daily_trips", dtype=Int32)], source=data_sources.driver, ) driver_stats_v2 = FeatureView( name="driver_stats_v2", - entities=["driver"], + entities=[driver_entity], schema=[ Field(name="avg_daily_trips", dtype=Int32), Field(name="conv_rate", dtype=Float32), @@ -201,7 +206,7 @@ def test_historical_features_with_shared_batch_source( 
source=data_sources.driver, ) - store.apply([driver(), driver_stats_v1, driver_stats_v2]) + store.apply([driver_entity, driver_stats_v1, driver_stats_v2]) with pytest.raises(KeyError): store.get_historical_features( @@ -328,7 +333,9 @@ def test_historical_features_with_entities_from_query( validate_dataframes( expected_df_query, actual_df_from_sql_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) table_from_sql_entities = job_from_sql.to_arrow().to_pandas() @@ -340,7 +347,9 @@ def test_historical_features_with_entities_from_query( validate_dataframes( expected_df_query, table_from_sql_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -380,6 +389,7 @@ def test_historical_features_persisting( name="saved_dataset", storage=environment.data_source_creator.create_saved_dataset_destination(), tags={"env": "test"}, + allow_overwrite=True, ) event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL @@ -413,13 +423,17 @@ def test_historical_features_persisting( validate_dataframes( expected_df, saved_dataset.to_df(), - keys=[event_timestamp, "driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) validate_dataframes( job.to_df(), saved_dataset.to_df(), - keys=[event_timestamp, "driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -491,7 +505,9 @@ def test_historical_features_with_no_ttl( validate_dataframes( expected_df, job.to_df(), - keys=[event_timestamp, 
"driver_id", "customer_id"], + sort_by=[event_timestamp, "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -569,8 +585,7 @@ def test_historical_features_from_bigquery_sources_containing_backfills(environm name="driver_stats", entities=[driver], schema=[Field(name="avg_daily_trips", dtype=Int32)], - batch_source=driver_stats_data_source, - ttl=None, + source=driver_stats_data_source, ) store.apply([driver, driver_fv]) @@ -589,4 +604,8 @@ def test_historical_features_from_bigquery_sources_containing_backfills(environm print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")) assert sorted(expected_df.columns) == sorted(actual_df.columns) - validate_dataframes(expected_df, actual_df, keys=["driver_id"]) + validate_dataframes( + expected_df, + actual_df, + sort_by=["driver_id"], + ) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 738b00f7d7..7852991652 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -13,11 +13,14 @@ import requests from botocore.exceptions import BotoCoreError -from feast import Entity, FeatureService, FeatureView, Field +from feast.entity import Entity from feast.errors import ( FeatureNameCollisionError, RequestDataNotFoundInEntityRowsException, ) +from feast.feature_service import FeatureService +from feast.feature_view import FeatureView +from feast.field import Field from feast.online_response import TIMESTAMP_POSTFIX from feast.types import Float32, Int32, String from feast.wait import wait_retry_backoff @@ -124,7 +127,7 @@ def test_write_to_online_store_event_check(environment): name="feature_view_123", schema=[Field(name="string_col", dtype=String)], entities=[e], - batch_source=file_source, + source=file_source, 
ttl=timedelta(minutes=5), ) # Register Feature View and Entity @@ -331,15 +334,16 @@ def test_online_retrieval_with_shared_batch_source(environment, universal_data_s fs = environment.feature_store entities, datasets, data_sources = universal_data_sources + driver_entity = driver() driver_stats_v1 = FeatureView( name="driver_stats_v1", - entities=["driver"], + entities=[driver_entity], schema=[Field(name="avg_daily_trips", dtype=Int32)], source=data_sources.driver, ) driver_stats_v2 = FeatureView( name="driver_stats_v2", - entities=["driver"], + entities=[driver_entity], schema=[ Field(name="avg_daily_trips", dtype=Int32), Field(name="conv_rate", dtype=Float32), @@ -347,7 +351,7 @@ def test_online_retrieval_with_shared_batch_source(environment, universal_data_s source=data_sources.driver, ) - fs.apply([driver(), driver_stats_v1, driver_stats_v2]) + fs.apply([driver_entity, driver_stats_v1, driver_stats_v2]) data = pd.DataFrame( { @@ -766,6 +770,21 @@ def eventually_apply() -> Tuple[None, bool]: assert all(v is None for v in online_features["value"]) +@pytest.mark.integration +@pytest.mark.universal_online_stores +def test_online_retrieval_success(feature_store_for_online_retrieval): + """ + Tests that online retrieval executes successfully (i.e. without errors). + + Does not test for correctness of the results of online retrieval. 
+ """ + fs, feature_refs, entity_rows = feature_store_for_online_retrieval + fs.get_online_features( + features=feature_refs, + entity_rows=entity_rows, + ) + + def response_feature_name( feature: str, feature_refs: List[str], full_feature_names: bool ) -> str: diff --git a/sdk/python/tests/integration/registration/__init__.py b/sdk/python/tests/integration/registration/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/integration/registration/test_feature_store.py b/sdk/python/tests/integration/registration/test_feature_store.py index 7b95afadba..deb1b0635f 100644 --- a/sdk/python/tests/integration/registration/test_feature_store.py +++ b/sdk/python/tests/integration/registration/test_feature_store.py @@ -30,11 +30,7 @@ from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig from feast.repo_config import RepoConfig from feast.types import Array, Bytes, Float64, Int64, String -from tests.utils.data_source_test_creator import ( - prep_file_source, - simple_bq_source_using_query_arg, - simple_bq_source_using_table_arg, -) +from tests.utils.data_source_test_creator import prep_file_source @pytest.mark.integration @@ -92,54 +88,23 @@ def test_feature_view_inference_success(test_feature_store, dataframe_source): entities=[entity], ttl=timedelta(minutes=5), online=True, - batch_source=file_source, - tags={}, - ) - - fv2 = FeatureView( - name="fv2", - entities=[entity], - ttl=timedelta(minutes=5), - online=True, - batch_source=simple_bq_source_using_table_arg(dataframe_source, "ts_1"), - tags={}, - ) - - fv3 = FeatureView( - name="fv3", - entities=[entity], - ttl=timedelta(minutes=5), - online=True, - batch_source=simple_bq_source_using_query_arg(dataframe_source, "ts_1"), + source=file_source, tags={}, ) - test_feature_store.apply([entity, fv1, fv2, fv3]) # Register Feature Views + test_feature_store.apply([entity, fv1]) # Register Feature Views feature_view_1 = test_feature_store.list_feature_views()[0] - 
feature_view_2 = test_feature_store.list_feature_views()[1] - feature_view_3 = test_feature_store.list_feature_views()[2] actual_file_source = { (feature.name, feature.dtype) for feature in feature_view_1.features } - actual_bq_using_table_arg_source = { - (feature.name, feature.dtype) for feature in feature_view_2.features - } - actual_bq_using_query_arg_source = { - (feature.name, feature.dtype) for feature in feature_view_3.features - } expected = { ("float_col", Float64), ("int64_col", Int64), ("string_col", String), } - assert ( - expected - == actual_file_source - == actual_bq_using_table_arg_source - == actual_bq_using_query_arg_source - ) + assert expected == actual_file_source test_feature_store.teardown() @@ -159,7 +124,6 @@ def test_apply_feature_view_integration(test_feature_store): path="file://feast/*", timestamp_field="ts_col", created_timestamp_column="timestamp", - date_partition_column="date_partition_col", ) entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) @@ -175,7 +139,7 @@ def test_apply_feature_view_integration(test_feature_store): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -230,6 +194,7 @@ def feature_store_with_local_registry(): project="default", provider="local", online_store=SqliteOnlineStoreConfig(path=online_store_path), + entity_key_serialization_version=2, ) ) @@ -253,6 +218,7 @@ def feature_store_with_gcs_registry(): registry=f"gs://{bucket_name}/registry.db", project="default", provider="gcp", + entity_key_serialization_version=2, ) ) @@ -271,5 +237,6 @@ def feature_store_with_s3_registry(): region=os.getenv("AWS_REGION", "us-west-2") ), offline_store=FileOfflineStoreConfig(), + entity_key_serialization_version=2, ) ) diff --git a/sdk/python/tests/integration/registration/test_inference.py b/sdk/python/tests/integration/registration/test_inference.py index de02fe53fe..17bb09933e 100644 --- 
a/sdk/python/tests/integration/registration/test_inference.py +++ b/sdk/python/tests/integration/registration/test_inference.py @@ -5,11 +5,7 @@ from feast import RepoConfig from feast.errors import RegistryInferenceFailure from feast.inference import update_data_sources_with_inferred_event_timestamp_col -from tests.utils.data_source_test_creator import ( - prep_file_source, - simple_bq_source_using_query_arg, - simple_bq_source_using_table_arg, -) +from tests.utils.data_source_test_creator import prep_file_source @pytest.mark.integration @@ -20,23 +16,27 @@ def test_update_file_data_source_with_inferred_event_timestamp_col(simple_datase with prep_file_source(df=simple_dataset_1) as file_source: data_sources = [ file_source, - simple_bq_source_using_table_arg(simple_dataset_1), - simple_bq_source_using_query_arg(simple_dataset_1), ] update_data_sources_with_inferred_event_timestamp_col( - data_sources, RepoConfig(provider="local", project="test") + data_sources, + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) actual_event_timestamp_cols = [ source.timestamp_field for source in data_sources ] - assert actual_event_timestamp_cols == ["ts_1", "ts_1", "ts_1"] + assert actual_event_timestamp_cols == ["ts_1"] with prep_file_source(df=df_with_two_viable_timestamp_cols) as file_source: with pytest.raises(RegistryInferenceFailure): # two viable timestamp_fields update_data_sources_with_inferred_event_timestamp_col( - [file_source], RepoConfig(provider="local", project="test") + [file_source], + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) @@ -52,7 +52,9 @@ def test_update_data_sources_with_inferred_event_timestamp_col(universal_data_so update_data_sources_with_inferred_event_timestamp_col( data_sources_copy.values(), - RepoConfig(provider="local", project="test"), + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) actual_event_timestamp_cols = [ 
source.timestamp_field for source in data_sources_copy.values() diff --git a/sdk/python/tests/integration/registration/test_registry.py b/sdk/python/tests/integration/registration/test_registry.py index 0cc161d997..739fb9ec5c 100644 --- a/sdk/python/tests/integration/registration/test_registry.py +++ b/sdk/python/tests/integration/registration/test_registry.py @@ -23,7 +23,7 @@ from feast.entity import Entity from feast.feature_view import FeatureView from feast.field import Field -from feast.registry import Registry +from feast.infra.registry.registry import Registry from feast.repo_config import RegistryConfig from feast.types import Array, Bytes, Int64, String from tests.utils.e2e_test_validation import validate_registry_data_source_apply @@ -129,7 +129,7 @@ def test_apply_feature_view_integration(test_registry): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) diff --git a/sdk/python/tests/integration/registration/test_universal_cli.py b/sdk/python/tests/integration/registration/test_universal_cli.py index 1fb82ce59f..e7f7a7cb63 100644 --- a/sdk/python/tests/integration/registration/test_universal_cli.py +++ b/sdk/python/tests/integration/registration/test_universal_cli.py @@ -26,7 +26,10 @@ def test_universal_cli(environment: Environment): try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path + project, + environment.test_repo_config, + repo_path, + environment.data_source_creator, ) repo_config = repo_path / "feature_store.yaml" @@ -120,7 +123,10 @@ def test_odfv_apply(environment) -> None: try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path + project, + environment.test_repo_config, + repo_path, + environment.data_source_creator, ) repo_config = repo_path / "feature_store.yaml" @@ -151,7 +157,10 @@ def 
test_nullable_online_store(test_nullable_online_store) -> None: try: repo_path = Path(repo_dir_name) feature_store_yaml = make_feature_store_yaml( - project, test_nullable_online_store, repo_path + project, + test_nullable_online_store, + repo_path, + test_nullable_online_store.offline_store_creator(project), ) repo_config = repo_path / "feature_store.yaml" diff --git a/sdk/python/tests/integration/scaffolding/__init__.py b/sdk/python/tests/integration/scaffolding/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/cli/__init__.py b/sdk/python/tests/unit/cli/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/cli/test_cli.py b/sdk/python/tests/unit/cli/test_cli.py index 9b535ce8fb..f55e5ffc06 100644 --- a/sdk/python/tests/unit/cli/test_cli.py +++ b/sdk/python/tests/unit/cli/test_cli.py @@ -72,6 +72,19 @@ def test_3rd_party_registry_store() -> None: assertpy.assert_that(return_code).is_equal_to(0) +def test_3rd_party_registry_store_with_fs_yaml_override() -> None: + runner = CliRunner() + + fs_yaml_file = "test_fs.yaml" + with setup_third_party_registry_store_repo( + "foo.registry_store.FooRegistryStore", fs_yaml_file_name=fs_yaml_file + ) as repo_path: + return_code, output = runner.run_with_output( + ["--feature-store-yaml", fs_yaml_file, "apply"], cwd=repo_path + ) + assertpy.assert_that(return_code).is_equal_to(0) + + @contextmanager def setup_third_party_provider_repo(provider_name: str): with tempfile.TemporaryDirectory() as repo_dir_name: @@ -106,13 +119,15 @@ def setup_third_party_provider_repo(provider_name: str): @contextmanager -def setup_third_party_registry_store_repo(registry_store: str): +def setup_third_party_registry_store_repo( + registry_store: str, fs_yaml_file_name: str = "feature_store.yaml" +): with tempfile.TemporaryDirectory() as repo_dir_name: # Construct an example repo in a temporary dir repo_path = Path(repo_dir_name) - repo_config = repo_path / 
"feature_store.yaml" + repo_config = repo_path / fs_yaml_file_name repo_config.write_text( dedent( diff --git a/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py b/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py index 998662781e..e331a1cc2d 100644 --- a/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py +++ b/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py @@ -15,7 +15,7 @@ def test_cli_apply_duplicated_featureview_names() -> None: def test_cli_apply_duplicate_data_source_names() -> None: run_simple_apply_test( example_repo_file_name="example_repo_duplicate_data_source_names.py", - expected_error=b"Please ensure that all data source names are case-insensitively unique", + expected_error=b"Multiple data sources share the same case-insensitive name", ) @@ -125,7 +125,7 @@ def test_cli_apply_imported_featureview_with_duplication() -> None: repo_example_2 = repo_path / "example_2.py" repo_example_2.write_text( "from datetime import timedelta\n" - "from example import driver_hourly_stats, driver_hourly_stats_view\n" + "from example import driver, driver_hourly_stats, driver_hourly_stats_view\n" "from feast import FeatureService, FeatureView\n" "a_feature_service = FeatureService(\n" " name='driver_locations_service',\n" @@ -133,10 +133,10 @@ def test_cli_apply_imported_featureview_with_duplication() -> None: ")\n" "driver_hourly_stats_view_2 = FeatureView(\n" " name='driver_hourly_stats',\n" - " entities=['driver_id'],\n" + " entities=[driver],\n" " ttl=timedelta(days=1),\n" " online=True,\n" - " batch_source=driver_hourly_stats,\n" + " source=driver_hourly_stats,\n" " tags={'dummy': 'true'})\n" ) diff --git a/sdk/python/tests/unit/cli/test_cli_chdir.py b/sdk/python/tests/unit/cli/test_cli_chdir.py index 8260a95efd..cf1d031227 100644 --- a/sdk/python/tests/unit/cli/test_cli_chdir.py +++ b/sdk/python/tests/unit/cli/test_cli_chdir.py @@ -15,7 +15,7 @@ def test_cli_chdir() -> None: # Make sure the path is absolute by resolving any symlinks temp_path 
= Path(temp_dir).resolve() result = runner.run(["init", "my_project"], cwd=temp_path) - repo_path = temp_path / "my_project" + repo_path = temp_path / "my_project" / "feature_repo" assert result.returncode == 0 result = runner.run(["--chdir", repo_path, "apply"], cwd=temp_path) diff --git a/sdk/python/tests/unit/diff/__init__.py b/sdk/python/tests/unit/diff/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/diff/test_registry_diff.py b/sdk/python/tests/unit/diff/test_registry_diff.py index 0effdfba97..8af6c50a13 100644 --- a/sdk/python/tests/unit/diff/test_registry_diff.py +++ b/sdk/python/tests/unit/diff/test_registry_diff.py @@ -1,9 +1,14 @@ +import pandas as pd + +from feast import Field from feast.diff.registry_diff import ( diff_registry_objects, tag_objects_for_keep_delete_update_add, ) from feast.entity import Entity from feast.feature_view import FeatureView +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import String from tests.utils.data_source_test_creator import prep_file_source @@ -13,34 +18,29 @@ def test_tag_objects_for_keep_delete_update_add(simple_dataset_1): to_delete = FeatureView( name="to_delete", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, ) unchanged_fv = FeatureView( name="fv1", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, ) pre_changed = FeatureView( name="fv2", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, tags={"when": "before"}, ) post_changed = FeatureView( name="fv2", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, tags={"when": "after"}, ) to_add = FeatureView( name="to_add", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, ) keep, delete, update, add = tag_objects_for_keep_delete_update_add( @@ -67,15 +67,13 @@ def test_diff_registry_objects_feature_views(simple_dataset_1): 
pre_changed = FeatureView( name="fv2", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, tags={"when": "before"}, ) post_changed = FeatureView( name="fv2", entities=[entity], - batch_source=file_source, - ttl=None, + source=file_source, tags={"when": "after"}, ) @@ -96,3 +94,54 @@ def test_diff_registry_objects_feature_views(simple_dataset_1): assert feast_object_diffs.feast_object_property_diffs[0].val_declared == { "when": "after" } + + +def test_diff_odfv(simple_dataset_1): + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity = Entity(name="id", join_keys=["id"]) + fv = FeatureView( + name="fv2", + entities=[entity], + source=file_source, + tags={"when": "before"}, + ) + + @on_demand_feature_view( + sources=[fv], + schema=[Field(name="first_char", dtype=String)], + ) + def pre_changed(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["first_char"] = inputs["string_col"].str[:1].astype("string") + return df + + @on_demand_feature_view( + sources=[fv], + schema=[Field(name="first_char", dtype=String)], + ) + def post_changed(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["first_char"] = inputs["string_col"].str[:1].astype("string") + "hi" + return df + + feast_object_diffs = diff_registry_objects( + pre_changed, pre_changed, "on demand feature view" + ) + assert len(feast_object_diffs.feast_object_property_diffs) == 0 + + feast_object_diffs = diff_registry_objects( + pre_changed, post_changed, "on demand feature view" + ) + + # Note that user_defined_function.body is excluded because it always changes (dill is non-deterministic), even + # if no code is changed + assert len(feast_object_diffs.feast_object_property_diffs) == 3 + assert feast_object_diffs.feast_object_property_diffs[0].property_name == "name" + assert ( + feast_object_diffs.feast_object_property_diffs[1].property_name + == "user_defined_function.name" + ) + assert ( + 
feast_object_diffs.feast_object_property_diffs[2].property_name + == "user_defined_function.body_text" + ) diff --git a/sdk/python/tests/unit/infra/__init__.py b/sdk/python/tests/unit/infra/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/infra/online_store/__init__.py b/sdk/python/tests/unit/infra/online_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py b/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py index c8eca6201f..9dca44dc09 100644 --- a/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py +++ b/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py @@ -41,6 +41,7 @@ def repo_config(): online_store=DynamoDBOnlineStoreConfig(region=REGION), # online_store={"type": "dynamodb", "region": REGION}, offline_store=FileOfflineStoreConfig(), + entity_key_serialization_version=2, ) diff --git a/sdk/python/tests/unit/infra/scaffolding/__init__.py b/sdk/python/tests/unit/infra/scaffolding/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/infra/scaffolding/test_repo_config.py b/sdk/python/tests/unit/infra/scaffolding/test_repo_config.py index 3ec91c0044..22fd1e696f 100644 --- a/sdk/python/tests/unit/infra/scaffolding/test_repo_config.py +++ b/sdk/python/tests/unit/infra/scaffolding/test_repo_config.py @@ -21,7 +21,7 @@ def _test_config(config_text, expect_error: Optional[str]): error = None rc = None try: - rc = load_repo_config(repo_path) + rc = load_repo_config(repo_path, repo_config) except FeastConfigError as e: error = e @@ -42,6 +42,7 @@ def test_nullable_online_store_aws(): registry: "registry.db" provider: aws online_store: null + entity_key_serialization_version: 2 """ ), expect_error="__root__ -> offline_store -> cluster_id\n" @@ -57,6 +58,7 @@ def test_nullable_online_store_gcp(): registry: "registry.db" 
provider: gcp online_store: null + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -71,6 +73,7 @@ def test_nullable_online_store_local(): registry: "registry.db" provider: local online_store: null + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -84,6 +87,7 @@ def test_local_config(): project: foo registry: "registry.db" provider: local + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -99,6 +103,7 @@ def test_local_config_with_full_online_class(): provider: local online_store: type: feast.infra.online_stores.sqlite.SqliteOnlineStore + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -114,6 +119,7 @@ def test_local_config_with_full_online_class_directly(): registry: "registry.db" provider: local online_store: feast.infra.online_stores.sqlite.SqliteOnlineStore + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -128,6 +134,7 @@ def test_gcp_config(): project: foo registry: gs://registry.db provider: gcp + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -161,6 +168,7 @@ def test_no_online_store_type(): provider: local online_store: path: "blah" + entity_key_serialization_version: 2 """ ), expect_error=None, @@ -190,6 +198,7 @@ def test_no_project(): provider: local online_store: path: foo + entity_key_serialization_version: 2 """ ), expect_error="1 validation error for RepoConfig\n" diff --git a/sdk/python/tests/unit/infra/test_inference_unit_tests.py b/sdk/python/tests/unit/infra/test_inference_unit_tests.py index 7a564679d6..c5ed83c12f 100644 --- a/sdk/python/tests/unit/infra/test_inference_unit_tests.py +++ b/sdk/python/tests/unit/infra/test_inference_unit_tests.py @@ -5,7 +5,6 @@ from feast.data_source import RequestSource from feast.entity import Entity from feast.errors import DataSourceNoNameException, SpecifiedFeaturesNotPresentError -from feast.feature import Feature from feast.feature_service import FeatureService from feast.feature_view import FeatureView 
from feast.field import Field @@ -15,7 +14,7 @@ ) from feast.on_demand_feature_view import on_demand_feature_view from feast.repo_config import RepoConfig -from feast.types import Float32, Float64, Int64, String, UnixTimestamp, ValueType +from feast.types import Float32, Float64, Int64, String, UnixTimestamp from tests.utils.data_source_test_creator import prep_file_source @@ -46,13 +45,9 @@ def test_infer_datasource_names_dwh(): assert data_source_with_query.name == source_name # If we have a query and no name, throw an error - if dwh_class == SparkSource: - with pytest.raises(DataSourceNoNameException): - print(f"Testing dwh {dwh_class}") - data_source = dwh_class(query="test_query") - else: + with pytest.raises(DataSourceNoNameException): + print(f"Testing dwh {dwh_class}") data_source = dwh_class(query="test_query") - assert data_source.name == "" def test_on_demand_features_type_inference(): @@ -78,13 +73,10 @@ def test_view(features_df: pd.DataFrame) -> pd.DataFrame: test_view.infer_features() @on_demand_feature_view( - # Note: we deliberately use `inputs` instead of `sources` to test that `inputs` - # still works correctly, even though it is deprecated. - # TODO(felixwang9817): Remove references to `inputs` once it is fully deprecated. - inputs={"date_request": date_request}, - features=[ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="object_output", dtype=ValueType.STRING), + sources=[date_request], + schema=[ + Field(name="output", dtype=UnixTimestamp), + Field(name="object_output", dtype=String), ], ) def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: @@ -97,14 +89,11 @@ def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: invalid_test_view.infer_features() @on_demand_feature_view( - # Note: we deliberately use positional arguments here to test that they work correctly, - # even though positional arguments are deprecated in favor of keyword arguments. 
- # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. - [ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="missing", dtype=ValueType.STRING), + schema=[ + Field(name="output", dtype=UnixTimestamp), + Field(name="missing", dtype=String), ], - {"date_request": date_request}, + sources=[date_request], ) def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: data = pd.DataFrame() @@ -115,30 +104,19 @@ def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: test_view_with_missing_feature.infer_features() -# TODO(kevjumba): remove this in feast 0.24 when deprecating -@pytest.mark.parametrize( - "request_source_schema", - [ - [Field(name="some_date", dtype=UnixTimestamp)], - {"some_date": ValueType.UNIX_TIMESTAMP}, - ], -) -def test_datasource_inference(request_source_schema): +def test_datasource_inference(): # Create Feature Views date_request = RequestSource( name="date_request", - schema=request_source_schema, + schema=[Field(name="some_date", dtype=UnixTimestamp)], ) @on_demand_feature_view( - # Note: we deliberately use positional arguments here to test that they work correctly, - # even though positional arguments are deprecated in favor of keyword arguments. - # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. 
- [ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="string_output", dtype=ValueType.STRING), - ], sources=[date_request], + schema=[ + Field(name="output", dtype=UnixTimestamp), + Field(name="string_output", dtype=String), + ], ) def test_view(features_df: pd.DataFrame) -> pd.DataFrame: data = pd.DataFrame() @@ -166,9 +144,9 @@ def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: @on_demand_feature_view( sources=[date_request], - features=[ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="missing", dtype=ValueType.STRING), + schema=[ + Field(name="output", dtype=UnixTimestamp), + Field(name="missing", dtype=String), ], ) def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: @@ -212,7 +190,11 @@ def test_feature_view_inference_respects_basic_inference(): assert len(feature_view_1.entity_columns) == 1 update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + [feature_view_1], + [entity1], + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) assert len(feature_view_1.schema) == 2 assert len(feature_view_1.features) == 1 @@ -225,7 +207,9 @@ def test_feature_view_inference_respects_basic_inference(): update_feature_views_with_inferred_features_and_entities( [feature_view_2], [entity1, entity2], - RepoConfig(provider="local", project="test"), + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) assert len(feature_view_2.schema) == 3 assert len(feature_view_2.features) == 1 @@ -250,7 +234,11 @@ def test_feature_view_inference_on_entity_columns(simple_dataset_1): assert len(feature_view_1.entity_columns) == 0 update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + [feature_view_1], + [entity1], + RepoConfig( + provider="local", project="test", 
entity_key_serialization_version=2 + ), ) # The schema is only used as a parameter, as is therefore not updated during inference. @@ -263,41 +251,6 @@ def test_feature_view_inference_on_entity_columns(simple_dataset_1): assert len(feature_view_1.entity_columns) == 1 -def test_feature_view_inference_respects_entity_value_type(simple_dataset_1): - """ - Tests that feature view inference still respects an entity's value type. - """ - # TODO(felixwang9817): Remove this test once entity value_type is removed. - with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: - entity1 = Entity( - name="test1", join_keys=["id_join_key"], value_type=ValueType.STRING - ) - feature_view_1 = FeatureView( - name="test1", - entities=[entity1], - schema=[Field(name="int64_col", dtype=Int64)], - source=file_source, - ) - - assert len(feature_view_1.schema) == 1 - assert len(feature_view_1.features) == 1 - assert len(feature_view_1.entity_columns) == 0 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") - ) - - # The schema is only used as a parameter, as is therefore not updated during inference. - assert len(feature_view_1.schema) == 1 - - # Since there is already a feature specified, additional features are not inferred. - assert len(feature_view_1.features) == 1 - - # The single entity column is inferred correctly and has type String. - assert len(feature_view_1.entity_columns) == 1 - assert feature_view_1.entity_columns[0].dtype == String - - def test_feature_view_inference_on_feature_columns(simple_dataset_1): """ Tests that feature view inference correctly infers feature columns. 
@@ -316,7 +269,11 @@ def test_feature_view_inference_on_feature_columns(simple_dataset_1): assert len(feature_view_1.entity_columns) == 1 update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + [feature_view_1], + [entity1], + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) # The schema is only used as a parameter, as is therefore not updated during inference. @@ -337,6 +294,10 @@ def test_feature_view_inference_on_feature_columns(simple_dataset_1): def test_update_feature_services_with_inferred_features(simple_dataset_1): + """ + Tests that a feature service that references feature views without specified features will + be updated with the correct projections after feature inference. + """ with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: entity1 = Entity(name="test1", join_keys=["id_join_key"]) feature_view_1 = FeatureView( @@ -362,7 +323,9 @@ def test_update_feature_services_with_inferred_features(simple_dataset_1): update_feature_views_with_inferred_features_and_entities( [feature_view_1, feature_view_2], [entity1], - RepoConfig(provider="local", project="test"), + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), ) feature_service.infer_features( fvs_to_update={ @@ -379,4 +342,60 @@ def test_update_feature_services_with_inferred_features(simple_dataset_1): assert len(feature_service.feature_view_projections[1].features) == 3 +def test_update_feature_services_with_specified_features(simple_dataset_1): + """ + Tests that a feature service that references feature views with specified features will + have the correct projections both before and after feature inference. 
+ """ + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity1 = Entity(name="test1", join_keys=["id_join_key"]) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + schema=[ + Field(name="float_col", dtype=Float32), + Field(name="id_join_key", dtype=Int64), + ], + source=file_source, + ) + feature_view_2 = FeatureView( + name="test2", + entities=[entity1], + schema=[ + Field(name="int64_col", dtype=Int64), + Field(name="id_join_key", dtype=Int64), + ], + source=file_source, + ) + + feature_service = FeatureService( + name="fs_1", features=[feature_view_1[["float_col"]], feature_view_2] + ) + assert len(feature_service.feature_view_projections) == 2 + assert len(feature_service.feature_view_projections[0].features) == 1 + assert len(feature_service.feature_view_projections[0].desired_features) == 0 + assert len(feature_service.feature_view_projections[1].features) == 1 + assert len(feature_service.feature_view_projections[1].desired_features) == 0 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1, feature_view_2], + [entity1], + RepoConfig( + provider="local", project="test", entity_key_serialization_version=2 + ), + ) + assert len(feature_view_1.features) == 1 + assert len(feature_view_2.features) == 1 + + feature_service.infer_features( + fvs_to_update={ + feature_view_1.name: feature_view_1, + feature_view_2.name: feature_view_2, + } + ) + + assert len(feature_service.feature_view_projections[0].features) == 1 + assert len(feature_service.feature_view_projections[1].features) == 1 + + # TODO(felixwang9817): Add tests that interact with field mapping. 
diff --git a/sdk/python/tests/unit/infra/test_local_registry.py b/sdk/python/tests/unit/infra/test_local_registry.py index d69ae6aafd..1e3b2aec88 100644 --- a/sdk/python/tests/unit/infra/test_local_registry.py +++ b/sdk/python/tests/unit/infra/test_local_registry.py @@ -23,15 +23,15 @@ from feast.data_format import AvroFormat, ParquetFormat from feast.data_source import KafkaSource from feast.entity import Entity -from feast.feature import Feature from feast.feature_view import FeatureView from feast.field import Field +from feast.infra.registry.registry import Registry from feast.on_demand_feature_view import RequestSource, on_demand_feature_view -from feast.registry import Registry from feast.repo_config import RegistryConfig from feast.stream_feature_view import StreamFeatureView from feast.types import Array, Bytes, Float32, Int32, Int64, String from feast.value_type import ValueType +from tests.integration.feature_repos.universal.entities import driver from tests.utils.e2e_test_validation import validate_registry_data_source_apply @@ -113,7 +113,7 @@ def test_apply_feature_view_success(test_registry): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -181,7 +181,7 @@ def test_apply_on_demand_feature_view_success(test_registry): driver_daily_features_view = FeatureView( name="driver_daily_features", - entities=["driver"], + entities=[driver()], ttl=timedelta(seconds=8640000000), schema=[ Field(name="daily_miles_driven", dtype=Float32), @@ -313,12 +313,7 @@ def simple_udf(x: int): "test_registry", [lazy_fixture("local_registry")], ) -# TODO(kevjumba): remove this in feast 0.24 when deprecating -@pytest.mark.parametrize( - "request_source_schema", - [[Field(name="my_input_1", dtype=Int32)], {"my_input_1": ValueType.INT32}], -) -def test_modify_feature_views_success(test_registry, request_source_schema): +def test_modify_feature_views_success(test_registry): # Create Feature Views 
batch_source = FileSource( file_format=ParquetFormat(), @@ -329,7 +324,7 @@ def test_modify_feature_views_success(test_registry, request_source_schema): request_source = RequestSource( name="request_source", - schema=request_source_schema, + schema=[Field(name="my_input_1", dtype=Int32)], ) entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) @@ -339,14 +334,14 @@ def test_modify_feature_views_success(test_registry, request_source_schema): schema=[Field(name="fs1_my_feature_1", dtype=Int64)], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + schema=[ + Field(name="odfv1_my_feature_1", dtype=String), + Field(name="odfv1_my_feature_2", dtype=Int32), ], sources=[request_source], ) @@ -364,9 +359,9 @@ def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: # Modify odfv by changing a single feature dtype @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + schema=[ + Field(name="odfv1_my_feature_1", dtype=Float32), + Field(name="odfv1_my_feature_2", dtype=Int32), ], sources=[request_source], ) diff --git a/sdk/python/tests/unit/infra/test_provider.py b/sdk/python/tests/unit/infra/test_provider.py index 217a1361b4..31ca5ec106 100644 --- a/sdk/python/tests/unit/infra/test_provider.py +++ b/sdk/python/tests/unit/infra/test_provider.py @@ -23,12 +23,12 @@ def test_get_column_names_preserves_feature_ordering(): - entity = Entity("my-entity", description="My entity") + entity = Entity(name="my-entity", description="My entity") fv = FeatureView( name="my-fv", entities=[entity], ttl=timedelta(days=1), - batch_source=BigQuerySource(table="non-existent-mock"), + source=BigQuerySource(table="non-existent-mock"), schema=[ 
Field(name="a", dtype=String), Field(name="b", dtype=String), diff --git a/sdk/python/tests/unit/local_feast_tests/__init__.py b/sdk/python/tests/unit/local_feast_tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py b/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py index 97d6463f5f..1ead69f52a 100644 --- a/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py +++ b/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py @@ -52,7 +52,7 @@ def test_e2e_local() -> None: ) with runner.local_repo( - get_example_repo("example_feature_repo_version_0_19.py") + get_example_repo("example_feature_repo_with_bfvs.py") .replace("%PARQUET_PATH%", driver_stats_path) .replace("%PARQUET_PATH_GLOBAL%", global_stats_path), "file", @@ -153,7 +153,7 @@ def test_partial() -> None: Field(name="test", dtype=String), ], online=True, - batch_source=driver_locations_source, + source=driver_locations_source, tags={}, ) diff --git a/sdk/python/tests/unit/local_feast_tests/test_feature_service.py b/sdk/python/tests/unit/local_feast_tests/test_feature_service.py new file mode 100644 index 0000000000..82c1dd2a1d --- /dev/null +++ b/sdk/python/tests/unit/local_feast_tests/test_feature_service.py @@ -0,0 +1,96 @@ +import os +import tempfile +from datetime import datetime, timedelta + +from feast.driver_test_data import ( + create_driver_hourly_stats_df, + create_global_daily_stats_df, +) +from tests.utils.basic_read_write_test import basic_rw_test +from tests.utils.cli_repo_creator import CliRunner, get_example_repo + + +def test_apply_without_fv_inference() -> None: + """ + Tests that feature services based on feature views that do not require inference can be applied correctly. 
+ """ + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_with_feature_service_2.py"), "file" + ) as store: + assert len(store.list_feature_services()) == 2 + + fs = store.get_feature_service("all_stats") + assert len(fs.feature_view_projections) == 2 + assert len(fs.feature_view_projections[0].features) == 3 + assert len(fs.feature_view_projections[0].desired_features) == 0 + assert len(fs.feature_view_projections[1].features) == 2 + assert len(fs.feature_view_projections[1].desired_features) == 0 + assert len(fs.tags) == 1 + assert fs.tags["release"] == "production" + + fs = store.get_feature_service("some_stats") + assert len(fs.feature_view_projections) == 2 + assert len(fs.feature_view_projections[0].features) == 1 + assert len(fs.feature_view_projections[0].desired_features) == 0 + assert len(fs.feature_view_projections[0].features) == 1 + assert len(fs.feature_view_projections[0].desired_features) == 0 + + +def test_apply_with_fv_inference() -> None: + """ + Tests that feature services based on feature views that require inference can be applied correctly. + """ + runner = CliRunner() + with tempfile.TemporaryDirectory() as data_dir: + # Generate test data. 
+ end_date = datetime.now().replace(microsecond=0, second=0, minute=0) + start_date = end_date - timedelta(days=15) + + driver_entities = [1001, 1002, 1003, 1004, 1005] + driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date) + driver_stats_path = os.path.join(data_dir, "driver_stats.parquet") + driver_df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True) + + global_df = create_global_daily_stats_df(start_date, end_date) + global_stats_path = os.path.join(data_dir, "global_stats.parquet") + global_df.to_parquet(path=global_stats_path, allow_truncated_timestamps=True) + + with runner.local_repo( + get_example_repo("example_feature_repo_with_feature_service_3.py") + .replace("%PARQUET_PATH%", driver_stats_path) + .replace("%PARQUET_PATH_GLOBAL%", global_stats_path), + "file", + ) as store: + assert len(store.list_feature_services()) == 2 + + fs = store.get_feature_service("all_stats") + assert len(fs.feature_view_projections) == 2 + assert len(fs.feature_view_projections[0].features) == 3 + assert len(fs.feature_view_projections[0].desired_features) == 0 + assert len(fs.feature_view_projections[1].features) == 2 + assert len(fs.feature_view_projections[1].desired_features) == 0 + assert len(fs.tags) == 1 + assert fs.tags["release"] == "production" + + fs = store.get_feature_service("some_stats") + assert len(fs.feature_view_projections) == 2 + assert len(fs.feature_view_projections[0].features) == 1 + assert len(fs.feature_view_projections[0].desired_features) == 0 + assert len(fs.feature_view_projections[0].features) == 1 + assert len(fs.feature_view_projections[0].desired_features) == 0 + + +def test_read() -> None: + """ + Test that feature values are correctly read through a feature service. 
+ """ + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_with_feature_service.py"), "file" + ) as store: + basic_rw_test( + store, + view_name="driver_locations", + feature_service_name="driver_locations_service", + ) diff --git a/sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py b/sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py deleted file mode 100644 index dc642a6e3c..0000000000 --- a/sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py +++ /dev/null @@ -1,25 +0,0 @@ -from feast.feature_service import FeatureService -from tests.utils.cli_repo_creator import CliRunner, get_example_repo - - -def test_read_pre_applied() -> None: - """ - Read feature values from the FeatureStore using a FeatureService. - """ - runner = CliRunner() - with runner.local_repo( - get_example_repo("example_feature_repo_with_feature_service.py"), "file" - ) as store: - assert len(store.list_feature_services()) == 1 - fs = store.get_feature_service("driver_locations_service") - assert len(fs.tags) == 1 - assert fs.tags["release"] == "production" - - fv = store.get_feature_view("driver_locations") - - fs = FeatureService(name="new_feature_service", features=[fv[["lon"]]]) - - store.apply([fs]) - - assert len(store.list_feature_services()) == 2 - store.get_feature_service("new_feature_service") diff --git a/sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py b/sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py deleted file mode 100644 index 2b5b311dc9..0000000000 --- a/sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py +++ /dev/null @@ -1,17 +0,0 @@ -from tests.utils.basic_read_write_test import basic_rw_test -from tests.utils.cli_repo_creator import CliRunner, get_example_repo - - -def test_feature_service_read() -> None: - """ - Read feature values from the FeatureStore using a FeatureService. 
- """ - runner = CliRunner() - with runner.local_repo( - get_example_repo("example_feature_repo_with_feature_service.py"), "file" - ) as store: - basic_rw_test( - store, - view_name="driver_locations", - feature_service_name="driver_locations_service", - ) diff --git a/sdk/python/tests/unit/local_feast_tests/test_init.py b/sdk/python/tests/unit/local_feast_tests/test_init.py index f9bf536e56..c5d3cbe57d 100644 --- a/sdk/python/tests/unit/local_feast_tests/test_init.py +++ b/sdk/python/tests/unit/local_feast_tests/test_init.py @@ -15,7 +15,7 @@ def test_repo_init() -> None: with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) result = runner.run(["init", "my_project"], cwd=temp_path) - repo_path = temp_path / "my_project" + repo_path = temp_path / "my_project" / "feature_repo" assert result.returncode == 0 result = runner.run(["apply"], cwd=repo_path) assert result.returncode == 0 diff --git a/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py b/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py index 44a35e0660..2cced75eb2 100644 --- a/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py +++ b/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py @@ -4,15 +4,20 @@ import pytest from pytest_lazyfixture import lazy_fixture -from feast import FileSource -from feast.data_format import ParquetFormat +from feast import BatchFeatureView +from feast.aggregation import Aggregation +from feast.data_format import AvroFormat, ParquetFormat +from feast.data_source import KafkaSource from feast.entity import Entity from feast.feature_store import FeatureStore from feast.feature_view import FeatureView from feast.field import Field +from feast.infra.offline_stores.file_source import FileSource from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig from feast.repo_config import RepoConfig -from feast.types import Array, Bytes, Int64, String +from feast.stream_feature_view import 
stream_feature_view +from feast.types import Array, Bytes, Float32, Int64, String +from tests.utils.cli_repo_creator import CliRunner, get_example_repo from tests.utils.data_source_test_creator import prep_file_source @@ -20,7 +25,7 @@ "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], ) -def test_apply_entity_success(test_feature_store): +def test_apply_entity(test_feature_store): entity = Entity( name="driver_car_id", description="Car driver id", @@ -48,14 +53,13 @@ def test_apply_entity_success(test_feature_store): "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], ) -def test_apply_feature_view_success(test_feature_store): +def test_apply_feature_view(test_feature_store): # Create Feature Views batch_source = FileSource( file_format=ParquetFormat(), path="file://feast/*", timestamp_field="ts_col", created_timestamp_column="timestamp", - date_partition_column="date_partition_col", ) entity = Entity(name="fs1_my_entity_1", join_keys=["entity_id"]) @@ -71,18 +75,33 @@ def test_apply_feature_view_success(test_feature_store): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, + ttl=timedelta(minutes=5), + ) + + bfv = BatchFeatureView( + name="batch_feature_view", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + Field(name="entity_id", dtype=Int64), + ], + entities=[entity], + tags={"team": "matchmaking"}, + source=batch_source, ttl=timedelta(minutes=5), ) # Register Feature View - test_feature_store.apply([entity, fv1]) + test_feature_store.apply([entity, fv1, bfv]) feature_views = test_feature_store.list_feature_views() # List Feature Views assert ( - len(feature_views) == 1 + len(feature_views) == 2 and feature_views[0].name == "my_feature_view_1" and feature_views[0].features[0].name == 
"fs1_my_feature_1" and feature_views[0].features[0].dtype == Int64 @@ -102,7 +121,97 @@ def test_apply_feature_view_success(test_feature_store): "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], ) -def test_apply_object_and_read(test_feature_store): +def test_apply_feature_view_with_inline_batch_source( + test_feature_store, simple_dataset_1 +) -> None: + """Test that a feature view and an inline batch source are both correctly applied.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + driver_fv = FeatureView( + name="driver_fv", + entities=[entity], + source=file_source, + ) + + test_feature_store.apply([entity, driver_fv]) + + fvs = test_feature_store.list_feature_views() + assert len(fvs) == 1 + assert fvs[0] == driver_fv + + ds = test_feature_store.list_data_sources() + assert len(ds) == 1 + assert ds[0] == file_source + + +def test_apply_feature_view_with_inline_batch_source_from_repo() -> None: + """Test that a feature view and an inline batch source are both correctly applied.""" + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_with_inline_batch_source.py"), "file" + ) as store: + ds = store.list_data_sources() + assert len(ds) == 1 + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_feature_view_with_inline_stream_source( + test_feature_store, simple_dataset_1 +) -> None: + """Test that a feature view and an inline stream source are both correctly applied.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + 
watermark_delay_threshold=timedelta(days=1), + ) + + driver_fv = FeatureView( + name="driver_fv", + entities=[entity], + source=stream_source, + ) + + test_feature_store.apply([entity, driver_fv]) + + fvs = test_feature_store.list_feature_views() + assert len(fvs) == 1 + assert fvs[0] == driver_fv + + ds = test_feature_store.list_data_sources() + assert len(ds) == 2 + if isinstance(ds[0], FileSource): + assert ds[0] == file_source + assert ds[1] == stream_source + else: + assert ds[0] == stream_source + assert ds[1] == file_source + + +def test_apply_feature_view_with_inline_stream_source_from_repo() -> None: + """Test that a feature view and an inline stream source are both correctly applied.""" + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_with_inline_stream_source.py"), "file" + ) as store: + ds = store.list_data_sources() + assert len(ds) == 2 + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_entities_and_feature_views(test_feature_store): assert isinstance(test_feature_store, FeatureStore) # Create Feature Views batch_source = FileSource( @@ -127,7 +236,7 @@ def test_apply_object_and_read(test_feature_store): ], entities=[e1], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -142,7 +251,7 @@ def test_apply_object_and_read(test_feature_store): ], entities=[e2], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -164,9 +273,8 @@ def test_apply_object_and_read(test_feature_store): [lazy_fixture("feature_store_with_local_registry")], ) @pytest.mark.parametrize("dataframe_source", [lazy_fixture("simple_dataset_1")]) -def test_reapply_feature_view_success(test_feature_store, dataframe_source): +def test_reapply_feature_view(test_feature_store, dataframe_source): with prep_file_source(df=dataframe_source, 
timestamp_field="ts_1") as file_source: - e = Entity(name="id", join_keys=["id_join_key"]) # Create Feature View @@ -174,7 +282,7 @@ def test_reapply_feature_view_success(test_feature_store, dataframe_source): name="my_feature_view_1", schema=[Field(name="string_col", dtype=String)], entities=[e], - batch_source=file_source, + source=file_source, ttl=timedelta(minutes=5), ) @@ -204,7 +312,7 @@ def test_reapply_feature_view_success(test_feature_store, dataframe_source): name="my_feature_view_1", schema=[Field(name="int64_col", dtype=Int64)], entities=[e], - batch_source=file_source, + source=file_source, ttl=timedelta(minutes=5), ) test_feature_store.apply([fv1]) @@ -216,7 +324,7 @@ def test_reapply_feature_view_success(test_feature_store, dataframe_source): test_feature_store.teardown() -def test_apply_conflicting_featureview_names(feature_store_with_local_registry): +def test_apply_conflicting_feature_view_names(feature_store_with_local_registry): """Test applying feature views with non-case-insensitively unique names""" driver = Entity(name="driver", join_keys=["driver_id"]) customer = Entity(name="customer", join_keys=["customer_id"]) @@ -226,7 +334,7 @@ def test_apply_conflicting_featureview_names(feature_store_with_local_registry): entities=[driver], ttl=timedelta(seconds=10), online=False, - batch_source=FileSource(path="driver_stats.parquet"), + source=FileSource(path="driver_stats.parquet"), tags={}, ) @@ -235,7 +343,7 @@ def test_apply_conflicting_featureview_names(feature_store_with_local_registry): entities=[customer], ttl=timedelta(seconds=10), online=False, - batch_source=FileSource(path="customer_stats.parquet"), + source=FileSource(path="customer_stats.parquet"), tags={}, ) try: @@ -252,6 +360,191 @@ def test_apply_conflicting_featureview_names(feature_store_with_local_registry): feature_store_with_local_registry.teardown() +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def 
test_apply_stream_feature_view(test_feature_store, simple_dataset_1) -> None: + """Test that a stream feature view is correctly applied.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark_delay_threshold=timedelta(days=1), + ) + + @stream_feature_view( + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", + function="max", + time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + tags={}, + ) + def simple_sfv(df): + return df + + test_feature_store.apply([entity, simple_sfv]) + + stream_feature_views = test_feature_store.list_stream_feature_views() + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == simple_sfv + + features = test_feature_store.get_online_features( + features=["simple_sfv:dummy_field"], + entity_rows=[{"test_key": 1001}], + ).to_dict(include_event_timestamps=True) + + assert "test_key" in features + assert features["test_key"] == [1001] + assert "dummy_field" in features + assert features["dummy_field"] == [None] + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_stream_feature_view_udf(test_feature_store, simple_dataset_1) -> None: + """Test that a stream feature view with a udf is correctly applied.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity = 
Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark_delay_threshold=timedelta(days=1), + ) + + @stream_feature_view( + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", + function="max", + time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + tags={}, + ) + def pandas_view(pandas_df): + import pandas as pd + + assert type(pandas_df) == pd.DataFrame + df = pandas_df.transform(lambda x: x + 10, axis=1) + df.insert(2, "C", [20.2, 230.0, 34.0], True) + return df + + import pandas as pd + + test_feature_store.apply([entity, pandas_view]) + + stream_feature_views = test_feature_store.list_stream_feature_views() + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == pandas_view + + sfv = stream_feature_views[0] + + df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + new_df = sfv.udf(df) + expected_df = pd.DataFrame( + {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} + ) + assert new_df.equals(expected_df) + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_batch_source(test_feature_store, simple_dataset_1) -> None: + """Test that a batch source is applied correctly.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + test_feature_store.apply([file_source]) + + ds = test_feature_store.list_data_sources() + assert len(ds) == 1 + assert ds[0] == file_source + + +@pytest.mark.parametrize( + 
"test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_stream_source(test_feature_store, simple_dataset_1) -> None: + """Test that a stream source is applied correctly.""" + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark_delay_threshold=timedelta(days=1), + ) + + test_feature_store.apply([stream_source]) + + ds = test_feature_store.list_data_sources() + assert len(ds) == 2 + if isinstance(ds[0], FileSource): + assert ds[0] == file_source + assert ds[1] == stream_source + else: + assert ds[0] == stream_source + assert ds[1] == file_source + + +def test_apply_stream_source_from_repo() -> None: + """Test that a stream source is applied correctly.""" + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_with_stream_source.py"), "file" + ) as store: + ds = store.list_data_sources() + assert len(ds) == 2 + + @pytest.fixture def feature_store_with_local_registry(): fd, registry_path = mkstemp() @@ -262,5 +555,6 @@ def feature_store_with_local_registry(): project="default", provider="local", online_store=SqliteOnlineStoreConfig(path=online_store_path), + entity_key_serialization_version=2, ) ) diff --git a/sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py b/sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py deleted file mode 100644 index 0def3cc783..0000000000 --- a/sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py +++ /dev/null @@ -1,149 +0,0 @@ -from datetime import timedelta - -from feast.aggregation import Aggregation -from feast.data_format import AvroFormat -from feast.data_source import KafkaSource -from feast.entity import Entity -from feast.field import Field -from 
feast.stream_feature_view import stream_feature_view -from feast.types import Float32 -from tests.utils.cli_repo_creator import CliRunner, get_example_repo -from tests.utils.data_source_test_creator import prep_file_source - - -def test_apply_stream_feature_view(simple_dataset_1) -> None: - """ - Test apply of StreamFeatureView. - """ - runner = CliRunner() - with runner.local_repo( - get_example_repo("empty_feature_repo.py"), "file" - ) as fs, prep_file_source( - df=simple_dataset_1, timestamp_field="ts_1" - ) as file_source: - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - kafka_bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=file_source, - watermark_delay_threshold=timedelta(days=1), - ) - - @stream_feature_view( - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", - function="max", - time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", - function="count", - time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - tags={}, - ) - def simple_sfv(df): - return df - - fs.apply([entity, simple_sfv]) - - stream_feature_views = fs.list_stream_feature_views() - assert len(stream_feature_views) == 1 - assert stream_feature_views[0] == simple_sfv - - features = fs.get_online_features( - features=["simple_sfv:dummy_field"], - entity_rows=[{"test_key": 1001}], - ).to_dict(include_event_timestamps=True) - - assert "test_key" in features - assert features["test_key"] == [1001] - assert "dummy_field" in features - assert features["dummy_field"] == [None] - - -def test_stream_feature_view_udf(simple_dataset_1) -> None: - """ - Test apply of StreamFeatureView udfs are serialized correctly 
and usable. - """ - runner = CliRunner() - with runner.local_repo( - get_example_repo("empty_feature_repo.py"), "file" - ) as fs, prep_file_source( - df=simple_dataset_1, timestamp_field="ts_1" - ) as file_source: - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - kafka_bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=file_source, - watermark_delay_threshold=timedelta(days=1), - ) - - @stream_feature_view( - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", - function="max", - time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", - function="count", - time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - tags={}, - ) - def pandas_view(pandas_df): - import pandas as pd - - assert type(pandas_df) == pd.DataFrame - df = pandas_df.transform(lambda x: x + 10, axis=1) - df.insert(2, "C", [20.2, 230.0, 34.0], True) - return df - - import pandas as pd - - fs.apply([entity, pandas_view]) - - stream_feature_views = fs.list_stream_feature_views() - assert len(stream_feature_views) == 1 - assert stream_feature_views[0] == pandas_view - - sfv = stream_feature_views[0] - - df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) - new_df = sfv.udf(df) - expected_df = pd.DataFrame( - {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} - ) - assert new_df.equals(expected_df) diff --git a/sdk/python/tests/unit/online_store/test_online_retrieval.py b/sdk/python/tests/unit/online_store/test_online_retrieval.py index 731230a5f6..6f96e7b5d9 100644 --- a/sdk/python/tests/unit/online_store/test_online_retrieval.py +++ b/sdk/python/tests/unit/online_store/test_online_retrieval.py @@ 
-142,6 +142,7 @@ def test_online() -> None: online_store=store.config.online_store, project=store.project, provider=store.config.provider, + entity_key_serialization_version=2, ) ) @@ -204,6 +205,7 @@ def test_online() -> None: online_store=store.config.online_store, project=store.project, provider=store.config.provider, + entity_key_serialization_version=2, ) ) diff --git a/sdk/python/tests/unit/test_data_sources.py b/sdk/python/tests/unit/test_data_sources.py index 0b437e50b9..1e8fb75c3e 100644 --- a/sdk/python/tests/unit/test_data_sources.py +++ b/sdk/python/tests/unit/test_data_sources.py @@ -1,13 +1,11 @@ import pytest -from feast import ValueType from feast.data_format import ProtoFormat from feast.data_source import ( DataSource, KafkaSource, KinesisSource, PushSource, - RequestDataSource, RequestSource, ) from feast.field import Field @@ -32,17 +30,6 @@ def test_push_with_batch(): assert push_source.batch_source.name == push_source_unproto.batch_source.name -def test_request_data_source_deprecation(): - with pytest.warns(DeprecationWarning): - request_data_source = RequestDataSource( - name="vals_to_add", - schema={"val_to_add": ValueType.INT64, "val_to_add_2": ValueType.INT64}, - ) - request_data_source_proto = request_data_source.to_proto() - returned_request_source = RequestSource.from_proto(request_data_source_proto) - assert returned_request_source == request_data_source - - def test_request_source_primitive_type_to_proto(): schema = [ Field(name="f1", dtype=Float32), @@ -92,73 +79,6 @@ def test_hash(): assert len(s4) == 3 -# TODO(kevjumba): Remove this test in feast 0.24 when positional arguments are removed. 
-def test_default_data_source_kw_arg_warning(): - # source_class = request.param - with pytest.warns(DeprecationWarning): - source = KafkaSource( - "name", "column", "bootstrap_servers", ProtoFormat("class_path"), "topic" - ) - assert source.name == "name" - assert source.timestamp_field == "column" - assert source.kafka_options.kafka_bootstrap_servers == "bootstrap_servers" - assert source.kafka_options.topic == "topic" - with pytest.raises(ValueError): - KafkaSource("name", "column", "bootstrap_servers", topic="topic") - - with pytest.warns(DeprecationWarning): - source = KinesisSource( - "name", - "column", - "c_column", - ProtoFormat("class_path"), - "region", - "stream_name", - ) - assert source.name == "name" - assert source.timestamp_field == "column" - assert source.created_timestamp_column == "c_column" - assert source.kinesis_options.region == "region" - assert source.kinesis_options.stream_name == "stream_name" - - with pytest.raises(ValueError): - KinesisSource( - "name", "column", "c_column", region="region", stream_name="stream_name" - ) - - with pytest.warns(DeprecationWarning): - source = RequestSource( - "name", [Field(name="val_to_add", dtype=Int64)], description="description" - ) - assert source.name == "name" - assert source.description == "description" - - with pytest.raises(ValueError): - RequestSource("name") - - with pytest.warns(DeprecationWarning): - source = PushSource( - "name", - BigQuerySource(name="bigquery_source", table="table"), - description="description", - ) - assert source.name == "name" - assert source.description == "description" - assert source.batch_source.name == "bigquery_source" - - with pytest.raises(ValueError): - PushSource("name") - - # No name warning for DataSource - with pytest.warns(UserWarning): - source = KafkaSource( - timestamp_field="column", - kafka_bootstrap_servers="bootstrap_servers", - message_format=ProtoFormat("class_path"), - topic="topic", - ) - - def test_proto_conversion(): bigquery_source = 
BigQuerySource( name="test_source", diff --git a/sdk/python/tests/unit/test_entity.py b/sdk/python/tests/unit/test_entity.py index 66ed02a71c..78f7123104 100644 --- a/sdk/python/tests/unit/test_entity.py +++ b/sdk/python/tests/unit/test_entity.py @@ -19,32 +19,33 @@ def test_join_key_default(): - with pytest.deprecated_call(): - entity = Entity("my-entity", description="My entity") + entity = Entity(name="my-entity", description="My entity") assert entity.join_key == "my-entity" def test_entity_class_contains_tags(): - with pytest.deprecated_call(): - entity = Entity( - "my-entity", - description="My entity", - tags={"key1": "val1", "key2": "val2"}, - ) + entity = Entity( + name="my-entity", + description="My entity", + tags={"key1": "val1", "key2": "val2"}, + ) assert "key1" in entity.tags.keys() and entity.tags["key1"] == "val1" assert "key2" in entity.tags.keys() and entity.tags["key2"] == "val2" def test_entity_without_tags_empty_dict(): - with pytest.deprecated_call(): - entity = Entity("my-entity", description="My entity") + entity = Entity(name="my-entity", description="My entity") assert entity.tags == dict() assert len(entity.tags) == 0 def test_entity_without_description(): - with pytest.deprecated_call(): - Entity("my-entity") + _ = Entity(name="my-entity") + + +def test_entity_without_name(): + with pytest.raises(TypeError): + _ = Entity() def test_name_not_specified(): @@ -55,15 +56,6 @@ def test_multiple_args(): assertpy.assert_that(lambda: Entity("a", ValueType.STRING)).raises(ValueError) -def test_name_keyword(recwarn): - Entity(name="my-entity") - assert len(recwarn) == 0 - Entity(name="my-entity", join_key="test") - assert len(recwarn) == 1 - Entity(name="my-entity", join_keys=["test"]) - assert len(recwarn) == 1 - - def test_hash(): entity1 = Entity(name="my-entity") entity2 = Entity(name="my-entity") diff --git a/sdk/python/tests/unit/test_feature_service.py b/sdk/python/tests/unit/test_feature_service.py index da69809b3e..4448d2e8ea 100644 --- 
a/sdk/python/tests/unit/test_feature_service.py +++ b/sdk/python/tests/unit/test_feature_service.py @@ -1,5 +1,3 @@ -import pytest - from feast.feature_service import FeatureService from feast.feature_view import FeatureView from feast.field import Field @@ -59,22 +57,6 @@ def test_hash(): assert len(s4) == 3 -def test_feature_view_kw_args_warning(): - with pytest.warns(DeprecationWarning): - service = FeatureService("name", [], tags={"tag_1": "tag"}, description="desc") - assert service.name == "name" - assert service.tags == {"tag_1": "tag"} - assert service.description == "desc" - - # More positional args than name and features - with pytest.raises(ValueError): - service = FeatureService("name", [], {"tag_1": "tag"}, "desc") - - # No name defined. - with pytest.raises(ValueError): - service = FeatureService(features=[], tags={"tag_1": "tag"}, description="desc") - - @no_warnings def test_feature_view_kw_args_normal(): file_source = FileSource(name="my-file-source", path="test.parquet") diff --git a/sdk/python/tests/unit/test_feature_views.py b/sdk/python/tests/unit/test_feature_views.py index 7b608b621d..0fe3f839e1 100644 --- a/sdk/python/tests/unit/test_feature_views.py +++ b/sdk/python/tests/unit/test_feature_views.py @@ -24,7 +24,7 @@ def test_create_batch_feature_view(): source=batch_source, ) - with pytest.raises(ValueError): + with pytest.raises(TypeError): BatchFeatureView( name="test batch feature view", entities=[], ttl=timedelta(days=30) ) @@ -74,7 +74,7 @@ def test_create_stream_feature_view(): aggregations=[], ) - with pytest.raises(ValueError): + with pytest.raises(TypeError): StreamFeatureView( name="test batch feature view", entities=[], diff --git a/sdk/python/tests/unit/test_on_demand_feature_view.py b/sdk/python/tests/unit/test_on_demand_feature_view.py index 5a0f5c98d8..ca8e7b25cb 100644 --- a/sdk/python/tests/unit/test_on_demand_feature_view.py +++ b/sdk/python/tests/unit/test_on_demand_feature_view.py @@ -13,14 +13,12 @@ # limitations under 
the License. import pandas as pd -import pytest -from feast import RequestSource from feast.feature_view import FeatureView from feast.field import Field from feast.infra.offline_stores.file_source import FileSource -from feast.on_demand_feature_view import OnDemandFeatureView, on_demand_feature_view -from feast.types import Float32, String, UnixTimestamp +from feast.on_demand_feature_view import OnDemandFeatureView +from feast.types import Float32 def udf1(features_df: pd.DataFrame) -> pd.DataFrame: @@ -57,6 +55,7 @@ def test_hash(): Field(name="output2", dtype=Float32), ], udf=udf1, + udf_string="udf1 source code", ) on_demand_feature_view_2 = OnDemandFeatureView( name="my-on-demand-feature-view", @@ -66,6 +65,7 @@ def test_hash(): Field(name="output2", dtype=Float32), ], udf=udf1, + udf_string="udf1 source code", ) on_demand_feature_view_3 = OnDemandFeatureView( name="my-on-demand-feature-view", @@ -75,6 +75,7 @@ def test_hash(): Field(name="output2", dtype=Float32), ], udf=udf2, + udf_string="udf2 source code", ) on_demand_feature_view_4 = OnDemandFeatureView( name="my-on-demand-feature-view", @@ -84,6 +85,7 @@ def test_hash(): Field(name="output2", dtype=Float32), ], udf=udf2, + udf_string="udf2 source code", description="test", ) @@ -103,63 +105,3 @@ def test_hash(): on_demand_feature_view_4, } assert len(s4) == 3 - - -def test_inputs_parameter_deprecation_in_odfv(): - date_request = RequestSource( - name="date_request", - schema=[Field(name="some_date", dtype=UnixTimestamp)], - ) - with pytest.warns(DeprecationWarning): - - @on_demand_feature_view( - inputs={"date_request": date_request}, - schema=[ - Field(name="output", dtype=UnixTimestamp), - Field(name="string_output", dtype=String), - ], - ) - def test_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) - return data - - odfv = test_view - assert odfv.name == 
"test_view" - assert len(odfv.source_request_sources) == 1 - assert odfv.source_request_sources["date_request"].name == "date_request" - assert odfv.source_request_sources["date_request"].schema == date_request.schema - - with pytest.raises(ValueError): - - @on_demand_feature_view( - inputs={"date_request": date_request}, - sources=[date_request], - schema=[ - Field(name="output", dtype=UnixTimestamp), - Field(name="string_output", dtype=String), - ], - ) - def incorrect_testview(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) - return data - - @on_demand_feature_view( - inputs={"odfv": date_request}, - schema=[ - Field(name="output", dtype=UnixTimestamp), - Field(name="string_output", dtype=String), - ], - ) - def test_correct_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) - return data - - odfv = test_correct_view - assert odfv.name == "test_correct_view" - assert odfv.source_request_sources["date_request"].schema == date_request.schema diff --git a/sdk/python/tests/integration/registration/test_sql_registry.py b/sdk/python/tests/unit/test_sql_registry.py similarity index 87% rename from sdk/python/tests/integration/registration/test_sql_registry.py rename to sdk/python/tests/unit/test_sql_registry.py index 286b1abd21..877811d0f3 100644 --- a/sdk/python/tests/integration/registration/test_sql_registry.py +++ b/sdk/python/tests/unit/test_sql_registry.py @@ -22,17 +22,18 @@ from testcontainers.core.container import DockerContainer from testcontainers.core.waiting_utils import wait_for_logs -from feast import Feature, FileSource, RequestSource +from feast import FileSource, RequestSource from feast.data_format import ParquetFormat from feast.entity import Entity from feast.errors 
import FeatureViewNotFoundException from feast.feature_view import FeatureView from feast.field import Field -from feast.infra.registry_stores.sql import SqlRegistry +from feast.infra.registry.sql import SqlRegistry from feast.on_demand_feature_view import on_demand_feature_view from feast.repo_config import RegistryConfig from feast.types import Array, Bytes, Float32, Int32, Int64, String from feast.value_type import ValueType +from tests.integration.feature_repos.universal.entities import driver POSTGRES_USER = "test" POSTGRES_PASSWORD = "test" @@ -108,13 +109,27 @@ def mysql_registry(): container.stop() +@pytest.fixture(scope="session") +def sqlite_registry(): + registry_config = RegistryConfig( + registry_type="sql", + path="sqlite://", + ) + + yield SqlRegistry(registry_config, None) + + @pytest.mark.skipif( sys.platform == "darwin" and "GITHUB_REF" in os.environ, reason="does not run on mac github actions", ) @pytest.mark.parametrize( "sql_registry", - [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], ) def test_apply_entity_success(sql_registry): entity = Entity( @@ -153,6 +168,9 @@ def test_apply_entity_success(sql_registry): and entity.tags["team"] == "matchmaking" ) + # After the first apply, the created_timestamp should be the same as the last_update_timestamp. 
+ assert entity.created_timestamp == entity.last_updated_timestamp + sql_registry.delete_entity("driver_car_id", project) assert_project_uuid(project, project_uuid, sql_registry) entities = sql_registry.list_entities(project) @@ -174,7 +192,11 @@ def assert_project_uuid(project, project_uuid, sql_registry): ) @pytest.mark.parametrize( "sql_registry", - [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], ) def test_apply_feature_view_success(sql_registry): # Create Feature Views @@ -197,7 +219,7 @@ def test_apply_feature_view_success(sql_registry): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -237,6 +259,9 @@ def test_apply_feature_view_success(sql_registry): and feature_view.entities[0] == "fs1_my_entity_1" ) + # After the first apply, the created_timestamp should be the same as the last_update_timestamp. 
+ assert feature_view.created_timestamp == feature_view.last_updated_timestamp + sql_registry.delete_feature_view("my_feature_view_1", project) feature_views = sql_registry.list_feature_views(project) assert len(feature_views) == 0 @@ -250,7 +275,11 @@ def test_apply_feature_view_success(sql_registry): ) @pytest.mark.parametrize( "sql_registry", - [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], ) def test_apply_on_demand_feature_view_success(sql_registry): # Create Feature Views @@ -265,7 +294,7 @@ def test_apply_on_demand_feature_view_success(sql_registry): driver_daily_features_view = FeatureView( name="driver_daily_features", - entities=["driver"], + entities=[driver()], ttl=timedelta(seconds=8640000000), schema=[ Field(name="daily_miles_driven", dtype=Float32), @@ -334,13 +363,13 @@ def location_features_from_push(inputs: pd.DataFrame) -> pd.DataFrame: ) @pytest.mark.parametrize( "sql_registry", - [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], ) -@pytest.mark.parametrize( - "request_source_schema", - [[Field(name="my_input_1", dtype=Int32)], {"my_input_1": ValueType.INT32}], -) -def test_modify_feature_views_success(sql_registry, request_source_schema): +def test_modify_feature_views_success(sql_registry): # Create Feature Views batch_source = FileSource( file_format=ParquetFormat(), @@ -351,7 +380,7 @@ def test_modify_feature_views_success(sql_registry, request_source_schema): request_source = RequestSource( name="request_source", - schema=request_source_schema, + schema=[Field(name="my_input_1", dtype=Int32)], ) entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) @@ -361,14 +390,14 @@ def test_modify_feature_views_success(sql_registry, request_source_schema): schema=[Field(name="fs1_my_feature_1", 
dtype=Int64)], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + schema=[ + Field(name="odfv1_my_feature_1", dtype=String), + Field(name="odfv1_my_feature_2", dtype=Int32), ], sources=[request_source], ) @@ -386,9 +415,9 @@ def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: # Modify odfv by changing a single feature dtype @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + schema=[ + Field(name="odfv1_my_feature_1", dtype=Float32), + Field(name="odfv1_my_feature_2", dtype=Int32), ], sources=[request_source], ) @@ -459,10 +488,13 @@ def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: sys.platform == "darwin" and "GITHUB_REF" in os.environ, reason="does not run on mac github actions", ) -@pytest.mark.integration @pytest.mark.parametrize( "sql_registry", - [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], ) def test_apply_data_source(sql_registry): # Create Feature Views @@ -486,7 +518,7 @@ def test_apply_data_source(sql_registry): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) @@ -519,3 +551,25 @@ def test_apply_data_source(sql_registry): assert registry_batch_source == batch_source sql_registry.teardown() + + +@pytest.mark.skipif( + sys.platform == "darwin" and "GITHUB_REF" in os.environ, + reason="does not run on mac github actions", +) +@pytest.mark.parametrize( + "sql_registry", + [ + lazy_fixture("mysql_registry"), + lazy_fixture("pg_registry"), + lazy_fixture("sqlite_registry"), + ], +) +def 
test_update_infra(sql_registry): + # Create infra object + project = "project" + infra = sql_registry.get_infra(project=project) + + # Should run update infra successfully + sql_registry.update_infra(infra, project) + sql_registry.teardown() diff --git a/sdk/python/tests/utils/cli_repo_creator.py b/sdk/python/tests/utils/cli_repo_creator.py index a038b85840..92b6dd992a 100644 --- a/sdk/python/tests/utils/cli_repo_creator.py +++ b/sdk/python/tests/utils/cli_repo_creator.py @@ -60,7 +60,6 @@ def local_repo(self, example_repo_py: str, offline_store: str): ) with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - repo_path = Path(repo_dir_name) data_path = Path(data_dir_name) @@ -76,6 +75,7 @@ def local_repo(self, example_repo_py: str, offline_store: str): path: {data_path / "online_store.db"} offline_store: type: {offline_store} + entity_key_serialization_version: 2 """ ) ) @@ -84,11 +84,21 @@ def local_repo(self, example_repo_py: str, offline_store: str): repo_example.write_text(example_repo_py) result = self.run(["apply"], cwd=repo_path) - print(f"Apply: stdout: {str(result.stdout)}\n stderr: {str(result.stderr)}") - assert result.returncode == 0 + stdout = result.stdout.decode("utf-8") + stderr = result.stderr.decode("utf-8") + print(f"Apply stdout:\n{stdout}") + print(f"Apply stderr:\n{stderr}") + assert ( + result.returncode == 0 + ), f"stdout: {result.stdout}\nstderr: {result.stderr}" yield FeatureStore(repo_path=str(repo_path), config=None) result = self.run(["teardown"], cwd=repo_path) - print(f"Apply: stdout: {str(result.stdout)}\n stderr: {str(result.stderr)}") - assert result.returncode == 0 + stdout = result.stdout.decode("utf-8") + stderr = result.stderr.decode("utf-8") + print(f"Apply stdout:\n{stdout}") + print(f"Apply stderr:\n{stderr}") + assert ( + result.returncode == 0 + ), f"stdout: {result.stdout}\nstderr: {result.stderr}" diff --git a/sdk/python/tests/utils/e2e_test_validation.py 
b/sdk/python/tests/utils/e2e_test_validation.py index b2eb78f3c8..e2b8b14eb4 100644 --- a/sdk/python/tests/utils/e2e_test_validation.py +++ b/sdk/python/tests/utils/e2e_test_validation.py @@ -14,7 +14,7 @@ from feast.data_format import ParquetFormat from feast.entity import Entity from feast.field import Field -from feast.registry import Registry +from feast.infra.registry.registry import Registry from feast.types import Array, Bytes, Int64, String from tests.integration.feature_repos.integration_test_repo_config import ( IntegrationTestRepoConfig, @@ -164,8 +164,12 @@ def _check_offline_and_online_features( ) -def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): - offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) +def make_feature_store_yaml( + project, + test_repo_config, + repo_dir_name: Path, + offline_creator: DataSourceCreator, +): offline_store_config = offline_creator.create_offline_store_config() online_store = test_repo_config.online_store @@ -177,6 +181,7 @@ def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): offline_store=offline_store_config, online_store=online_store, repo_path=str(Path(repo_dir_name)), + entity_key_serialization_version=2, ) config_dict = config.dict() if ( @@ -199,7 +204,8 @@ def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): ), ] -if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True": +# Only test if this is NOT a local test +if os.getenv("FEAST_IS_LOCAL_TEST", "False") != "True": NULLABLE_ONLINE_STORE_CONFIGS.extend( [ IntegrationTestRepoConfig( @@ -238,7 +244,7 @@ def validate_registry_data_source_apply(test_registry: Registry): ], entities=[entity], tags={"team": "matchmaking"}, - batch_source=batch_source, + source=batch_source, ttl=timedelta(minutes=5), ) diff --git a/sdk/python/tests/utils/feature_records.py b/sdk/python/tests/utils/feature_records.py index acc08ec121..3f210f9e1c 100644 --- 
a/sdk/python/tests/utils/feature_records.py +++ b/sdk/python/tests/utils/feature_records.py @@ -1,12 +1,13 @@ from datetime import datetime, timedelta from typing import Any, Dict, List, Optional +import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal as pd_assert_frame_equal from pytz import utc -from feast import FeatureStore, utils +from feast import FeatureService, FeatureStore, utils from feast.errors import FeatureNameCollisionError from feast.feature_view import FeatureView @@ -283,12 +284,17 @@ def get_response_feature_name(feature: str, full_feature_names: bool) -> str: def assert_feature_service_correctness( - store, feature_service, full_feature_names, entity_df, expected_df, event_timestamp + store: FeatureStore, + feature_service: FeatureService, + full_feature_names: bool, + entity_df, + expected_df, + event_timestamp, ): job_from_df = store.get_historical_features( entity_df=entity_df, - features=feature_service, + features=store.get_feature_service(feature_service.name), full_feature_names=full_feature_names, ) @@ -309,7 +315,9 @@ def assert_feature_service_correctness( validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + sort_by=[event_timestamp, "order_id", "driver_id", "customer_id"], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) @@ -354,7 +362,7 @@ def assert_feature_service_entity_mapping_correctness( validate_dataframes( expected_df, actual_df_from_df_entities, - keys=[ + sort_by=[ event_timestamp, "order_id", "driver_id", @@ -362,6 +370,8 @@ def assert_feature_service_entity_mapping_correctness( "origin_id", "destination_id", ], + event_timestamp_column=event_timestamp, + timestamp_precision=timedelta(milliseconds=1), ) else: # using 2 of the same FeatureView without full_feature_names=True will result in collision @@ -373,18 +383,40 @@ def 
assert_feature_service_entity_mapping_correctness( ) -def validate_dataframes(expected_df, actual_df, keys): - expected_df: pd.DataFrame = ( - expected_df.sort_values(by=keys).drop_duplicates().reset_index(drop=True) +# Specify timestamp_precision to relax timestamp equality constraints +def validate_dataframes( + expected_df: pd.DataFrame, + actual_df: pd.DataFrame, + sort_by: List[str], + event_timestamp_column: Optional[str] = None, + timestamp_precision: timedelta = timedelta(seconds=0), +): + expected_df = ( + expected_df.sort_values(by=sort_by).drop_duplicates().reset_index(drop=True) ) actual_df = ( actual_df[expected_df.columns] - .sort_values(by=keys) + .sort_values(by=sort_by) .drop_duplicates() .reset_index(drop=True) ) - + if event_timestamp_column: + expected_timestamp_col = expected_df[event_timestamp_column].to_frame() + actual_timestamp_col = expected_df[event_timestamp_column].to_frame() + expected_df = expected_df.drop(event_timestamp_column, axis=1) + actual_df = actual_df.drop(event_timestamp_column, axis=1) + if event_timestamp_column in sort_by: + sort_by.remove(event_timestamp_column) + + diffs = expected_timestamp_col.to_numpy() - actual_timestamp_col.to_numpy() + for diff in diffs: + if isinstance(diff, np.ndarray): + diff = diff[0] + if isinstance(diff, np.timedelta64): + assert abs(diff) <= timestamp_precision.seconds + else: + assert abs(diff) <= timestamp_precision pd_assert_frame_equal( expected_df, actual_df, diff --git a/setup.py b/setup.py index f03aeefcf6..37ed471cfa 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ "numpy>=1.22,<3", "pandas>=1.4.3,<2", "pandavro==1.5.*", # For some reason pandavro higher than 1.5.* only support pandas less than 1.3. 
- "protobuf>3.20,<4", + "protobuf<5,>3", "proto-plus>=1.20.0,<2", "pyarrow>=4,<9", "pydantic>=1,<2", @@ -93,6 +93,8 @@ AWS_REQUIRED = ["boto3>=1.17.0,<=1.20.23", "docker>=5.0.2", "s3fs>=0.4.0,<=2022.01.0"] +BYTEWAX_REQUIRED = ["bytewax==0.10.0", "docker>=5.0.2", "kubernetes<=20.13.0"] + SNOWFLAKE_REQUIRED = [ "snowflake-connector-python[pandas]>=2.7.3,<=2.7.8", ] @@ -117,12 +119,26 @@ "happybase>=1.2.0,<3", ] +CASSANDRA_REQUIRED = [ + "cassandra-driver>=3.24.0,<4", +] + GE_REQUIRED = ["great_expectations>=0.14.0,<0.15.0"] GO_REQUIRED = [ "cffi==1.15.*,<2", ] +AZURE_REQUIRED = ( + [ + "azure-storage-blob>=0.37.0", + "azure-identity>=1.6.1", + "SQLAlchemy>=1.4.19", + "pyodbc>=4.0.30", + "pymssql", + ] +) + CI_REQUIRED = ( [ "build", @@ -170,6 +186,7 @@ + GCP_REQUIRED + REDIS_REQUIRED + AWS_REQUIRED + + BYTEWAX_REQUIRED + SNOWFLAKE_REQUIRED + SPARK_REQUIRED + POSTGRES_REQUIRED @@ -177,6 +194,8 @@ + TRINO_REQUIRED + GE_REQUIRED + HBASE_REQUIRED + + CASSANDRA_REQUIRED + + AZURE_REQUIRED ) @@ -501,16 +520,19 @@ def copy_extensions_to_source(self): "ci": CI_REQUIRED, "gcp": GCP_REQUIRED, "aws": AWS_REQUIRED, + "bytewax": BYTEWAX_REQUIRED, "redis": REDIS_REQUIRED, "snowflake": SNOWFLAKE_REQUIRED, "spark": SPARK_REQUIRED, "trino": TRINO_REQUIRED, "postgres": POSTGRES_REQUIRED, + "azure": AZURE_REQUIRED, "mysql": MYSQL_REQUIRED, "ge": GE_REQUIRED, "hbase": HBASE_REQUIRED, "go": GO_REQUIRED, "docs": DOCS_REQUIRED, + "cassandra": CASSANDRA_REQUIRED, }, include_package_data=True, license="Apache", diff --git a/ui/.npmrc b/ui/.npmrc new file mode 100644 index 0000000000..bd3327ab5a --- /dev/null +++ b/ui/.npmrc @@ -0,0 +1 @@ +//registry.npmjs.org/:_authToken=${NPM_TOKEN} \ No newline at end of file diff --git a/ui/CONTRIBUTING.md b/ui/CONTRIBUTING.md new file mode 100644 index 0000000000..970bd3676c --- /dev/null +++ b/ui/CONTRIBUTING.md @@ -0,0 +1,103 @@ +

Table of contents

+ +- [General contributor notes](#general-contributor-notes) + - [`feast ui` command](#feast-ui-command) + - [NPM package project structure](#npm-package-project-structure) + - [Tests](#tests) + - [Yarn commands](#yarn-commands) + - [`yarn install`](#yarn-install) + - [`yarn start`](#yarn-start) + - [`yarn test`](#yarn-test) +- [Release process](#release-process) + - [(Advanced) Manually publishing the Feast Package to NPM](#advanced-manually-publishing-the-feast-package-to-npm) + - [Requirements](#requirements) + - [Steps for Publishing](#steps-for-publishing) + +# General contributor notes +In this doc, we describe how to contribute both to the Feast Web UI NPM package as well as the embedded Feast UI in the Python SDK (i.e. what's run when you run `feast ui`) + +## `feast ui` command +You can see the logic in [../sdk/python/feast/ui](../sdk/python/feast/ui/). This instance is loaded in [../sdk/python/feast/ui_server.py](../sdk/python/feast/ui_server.py). + +Under the hood, what happens is that the Feast SDK spins up a server which exposes an endpoint to the registry. It then mounts the UI on the server and points it to fetch data from that registry. + +## NPM package project structure +The Web UI is powered by a JSON registry dump from Feast (running `feast registry-dump`). Running `yarn start` launches a UI +powered by test data. +- `public/` contains assets as well as demo data loaded by the Web UI. + - There is a `projects-list.json` which represents all Feast projects the UI shows. + - There is also a `registry.json` which is the registry dump for the feature repo. +- `feature_repo/` contains a sample Feast repo which generates the `registry.json` +- `src/` contains the Web UI source code. + - `src/contexts` has React context objects around project level metadata or registry path metadata to inject into pages. 
The contexts are static contexts provided by [FeastUISansProviders.tsx](src/FeastUISansProviders.tsx) + - `src/parsers` parses the `registry.json` into in memory representations of Feast objects (feature views, data sources, entities, feature services). + - This has ~1:1 mappings to the protobuf objects in [feast/protos/feast/core](https://github.com/feast-dev/feast/tree/master/protos/feast/core). + - There are also "relationships" which create an in-memory lineage graph which can be used to construct links in pages. + - This generates state which pages will load via React queries (to the registry path). + - `src/pages` has all individual web pages and their layouts. For any given Feast object (e.g. entity), there exist: + - an **Index page** (which is the first page you hit when you click on that object). This loads using a React query the in memory representation of all objects (parsed from `src/parsers`) and embeds: + - a **Listing page** (i.e. listing all the objects in the registry in a table). This creates links to the instance pages + - an **Instance page** (which shows details for an individual entity, feature view, etc). This embeds: + - a default Overview tab, which shows all the Feast metadata (e.g. for a given entity) + - custom tabs from `src/custom-tabs`. + - Other subdirectories: + - `src/components` has common React components that are re-used across pages + - `src/custom-tabs` houses custom tabs and a custom tab React context which exist on the core pages. There is a `TabsRegistryContext` which is also supplied by the [FeastUISansProviders.tsx](src/FeastUISansProviders.tsx), and if there are custom tabs, the Feast UI will embed them as a new tab in the corresponding page (e.g. feature view page). + - `src/graphics` houses icons that are used throughout the UI + - `src/hooks` has React hooks. The most complex hooks here define the bulk of the search / filter functionality. + +## Tests +There are very few tests for this UI. 
There is a smoke test that ensures pages can load in [FeastUISansProviders.test.tsx](src/FeastUISansProviders.test.tsx) + + +## Yarn commands + +If you would like to simply try things out and see how the UI works, you can simply run the code in this repo. + +> **Note**: there is an `.npmrc` which is set up for automatic releases. You'll need to comment out the line in there and continue + +First: + +### `yarn install` + +That will install all the dependencies that the UI needs, as well as development dependencies. Then in the project directory, you can run: + +### `yarn start` + +Runs the app in the development mode.\ +Open [http://localhost:3000](http://localhost:3000) to view it in the browser. + +The page will reload if you make edits.\ +You will also see any lint errors in the console. + +### `yarn test` + +Launches the test runner in the interactive watch mode.\ +See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information. + +# Release process +There are a couple of components in Feast that are tied to the Web UI. These are all automatically handled during the release GitHub action: +1. the npm package + - The release process for Feast automatically bumps the package version (see [bump_file_versions.py](../infra/scripts/release/bump_file_versions.py)) and releases the new NPM package (see [publish.yml](../.github/workflows/publish.yml) in the `publish-web-ui-npm` job) +2. the Feast Python SDK, which bundles in a compiled version of the Feast Web UI which is run on a `feast ui` CLI command.
+ - The bundled Web UI in the Python SDK always compiles in the latest npm version + +## (Advanced) Manually publishing the Feast Package to NPM + +This generally should not be necessary, since new package versions are released with the overall Feast release workflow (see [publish.yml](../.github/workflows/publish.yml) in the `publish-web-ui-npm` job) + +The Feast UI is published as a module to NPM and can be found here: https://www.npmjs.com/package/@feast-dev/feast-ui + +### Requirements + +To publish a new version of the module, you will need: +- to be part of the @feast-dev team in NPM. Ask `#feast-development` on http://slack.feast.dev to add you if necessary. +- to [login to your NPM account on the command line](https://docs.npmjs.com/cli/v8/commands/npm-adduser). + +### Steps for Publishing + +1. Make sure tests are passing. Run tests with `yarn test` in the ui directory. +2. Bump the version number in `package.json` as appropriate. +3. Package the modules for distributions. Run the library build script with `yarn build:lib`. We use [Rollup](https://rollupjs.org/) for building the module, and the configs are in the `rollup.config.js` file. +4. Publish the package to NPM. Run `npm publish` +5. [Check NPM to see that the package was properly published](https://www.npmjs.com/package/@feast-dev/feast-ui). \ No newline at end of file diff --git a/ui/PUBLISHING_TO_NPM.md b/ui/PUBLISHING_TO_NPM.md deleted file mode 100644 index 0ab1af3923..0000000000 --- a/ui/PUBLISHING_TO_NPM.md +++ /dev/null @@ -1,13 +0,0 @@ -# Publishing the Feast Package to NPM - -The Feast UI is published as a module to NPM and can be found here: https://www.npmjs.com/package/@feast-dev/feast-ui - -To publish a new version of the module, you will need to be part of the @feast-dev team in NPM. Ask Tony to add you if necessary. You will also need to [login to your NPM account on the command line](https://docs.npmjs.com/cli/v8/commands/npm-adduser). - -## Steps for Publishing - -1. 
Make sure tests are passing. Run tests with `yarn test` in the ui directory. -2. Bump the version number in `package.json` as appropriate. -3. Package the modules for distributions. Run the library build script with `yarn build:lib`. We use [Rollup](https://rollupjs.org/) for building the module, and the configs are in the `rollup.config.js` file. -4. Publish the package to NPM. Run `npm publish` -5. [Check NPM to see that the package was properly publish](https://www.npmjs.com/package/@feast-dev/feast-ui). diff --git a/ui/README.md b/ui/README.md index 713d1c17c5..e91a8741ec 100644 --- a/ui/README.md +++ b/ui/README.md @@ -4,22 +4,10 @@ This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app). -## Project structure -The Web UI is powered by a JSON registry dump from Feast (running `feast registry-dump`). Running `yarn start` launches a UI -powered by test data. -- `public/` contains assets as well as demo data loaded by the Web UI. - - There is a `projects-list.json` which represents all Feast projects the UI shows. - - There is also a `registry.json` which is the registry dump for the feature repo. -- `feature_repo/` contains a sample Feast repo which generates the `registry.json` -- `src/` contains the Web UI source code. This parses the registry json blob in `src/parsers` to make this data -available for the rest of the UI. -- `src/custom-tabs` includes sample custom tabs. This is a WIP plugin system where users can inject their own tabs and -data to the UI. - ## Usage There are three modes of usage: -- via the 'feast ui' CLI to view the current feature repository +- via the `feast ui` CLI to view the current feature repository - importing the UI as a module - running the entire build as a React app. @@ -124,28 +112,6 @@ const tabsRegistry = { Examples of custom tabs can be found in the `/custom-tabs` folder. 
-### Alternative: Run this Repo - -If you would like to simply try things out and see how the UI works, you can simply run the code in this repo. First: - -### `yarn install` - -That will install the all the dependencies that the UI needs, as well as development dependencies. Then in the project directory, you can run: - -### `yarn start` - -Runs the app in the development mode.\ -Open [http://localhost:3000](http://localhost:3000) to view it in the browser. - -The page will reload if you make edits.\ -You will also see any lint errors in the console. - -### `yarn test` - -Launches the test runner in the interactive watch mode.\ -See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information. - - ## On React and Create React App This project was bootstrapped with Create React App, and uses its scripts to simplify UI development. You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started). 
diff --git a/ui/feature_repo/feature_store.yaml b/ui/feature_repo/feature_store.yaml index 31b27e2385..6ecad3eb51 100644 --- a/ui/feature_repo/feature_store.yaml +++ b/ui/feature_repo/feature_store.yaml @@ -5,6 +5,3 @@ online_store: type: sqlite offline_store: type: file -flags: - alpha_features: true - on_demand_transforms: true diff --git a/ui/feature_repo/features.py b/ui/feature_repo/features.py index 0e6b7f4238..293f438c67 100644 --- a/ui/feature_repo/features.py +++ b/ui/feature_repo/features.py @@ -2,14 +2,13 @@ import pandas as pd -from feast import Entity, FeatureService, FeatureView, Field, FileSource, ValueType +from feast import Entity, FeatureService, FeatureView, Field, FileSource from feast.data_source import RequestSource from feast.on_demand_feature_view import on_demand_feature_view from feast.types import Bool, Int64, String zipcode = Entity( name="zipcode", - value_type=ValueType.INT64, description="A zipcode", tags={"owner": "danny@tecton.ai", "team": "hack week",}, ) @@ -23,7 +22,7 @@ zipcode_features = FeatureView( name="zipcode_features", - entities=["zipcode"], + entities=[zipcode], ttl=timedelta(days=3650), schema=[ Field(name="city", dtype=String), @@ -32,6 +31,7 @@ Field(name="tax_returns_filed", dtype=Int64), Field(name="population", dtype=Int64), Field(name="total_wages", dtype=Int64), + Field(name="zipcode", dtype=Int64), ], source=zipcode_source, tags={ @@ -44,7 +44,7 @@ zipcode_features = FeatureView( name="zipcode_features", - entities=["zipcode"], + entities=[zipcode], ttl=timedelta(days=3650), schema=[ Field(name="city", dtype=String), @@ -53,6 +53,7 @@ Field(name="tax_returns_filed", dtype=Int64), Field(name="population", dtype=Int64), Field(name="total_wages", dtype=Int64), + Field(name="zipcode", dtype=Int64), ], source=zipcode_source, tags={ @@ -65,11 +66,12 @@ zipcode_money_features = FeatureView( name="zipcode_money_features", - entities=["zipcode"], + entities=[zipcode], ttl=timedelta(days=3650), schema=[ 
Field(name="tax_returns_filed", dtype=Int64), Field(name="total_wages", dtype=Int64), + Field(name="zipcode", dtype=Int64), ], source=zipcode_source, tags={ @@ -82,7 +84,6 @@ dob_ssn = Entity( name="dob_ssn", - value_type=ValueType.STRING, description="Date of birth and last four digits of social security number", tags={"owner": "tony@tecton.ai", "team": "hack week",}, ) @@ -96,7 +97,7 @@ credit_history = FeatureView( name="credit_history", - entities=["dob_ssn"], + entities=[dob_ssn], ttl=timedelta(days=9000), schema=[ Field(name="credit_card_due", dtype=Int64), @@ -108,6 +109,7 @@ Field(name="missed_payments_1y", dtype=Int64), Field(name="missed_payments_6m", dtype=Int64), Field(name="bankruptcies", dtype=Int64), + Field(name="dob_ssn", dtype=String), ], source=credit_history_source, tags={ diff --git a/ui/package.json b/ui/package.json index 22128cc968..7f0e7c3fbe 100644 --- a/ui/package.json +++ b/ui/package.json @@ -1,6 +1,6 @@ { "name": "@feast-dev/feast-ui", - "version": "0.20.5", + "version": "0.24.0", "private": false, "files": [ "dist" diff --git a/ui/src/custom-tabs/data-tab/DataQuery.tsx b/ui/src/custom-tabs/data-tab/DataQuery.tsx index f101c122e4..c79764ef99 100644 --- a/ui/src/custom-tabs/data-tab/DataQuery.tsx +++ b/ui/src/custom-tabs/data-tab/DataQuery.tsx @@ -1,9 +1,5 @@ import { useQuery } from "react-query"; -interface DataQueryInterface { - featureView: string | undefined; -} - const DataQuery = (featureView: string) => { const queryKey = `data-tab-namespace:${featureView}`; @@ -13,8 +9,7 @@ const DataQuery = (featureView: string) => { // Customizing the URL based on your needs const url = `/demo-custom-tabs/demo.json`; - return fetch(url) - .then((res) => res.json()) + return fetch(url).then((res) => res.json()); }, { enabled: !!featureView, // Only start the query when the variable is not undefined diff --git a/ui/src/custom-tabs/data-tab/DataTab.tsx b/ui/src/custom-tabs/data-tab/DataTab.tsx index 144083420a..4592d197e2 100644 --- 
a/ui/src/custom-tabs/data-tab/DataTab.tsx +++ b/ui/src/custom-tabs/data-tab/DataTab.tsx @@ -14,7 +14,6 @@ import { EuiTableRow, EuiTableRowCell, } from "@elastic/eui"; -import useLoadRegularFeatureView from "../../pages/feature-views/useLoadFeatureView"; import DataQuery from "./DataQuery"; const FeatureViewDataRow = z.object({ @@ -26,29 +25,25 @@ type FeatureViewDataRowType = z.infer; const LineHeightProp: React.CSSProperties = { lineHeight: 1, -} +}; -const EuiFeatureViewDataRow = ({name, value}: FeatureViewDataRowType) => { +const EuiFeatureViewDataRow = ({ name, value }: FeatureViewDataRowType) => { return ( - - {name} - + {name} -
-            {value}
-          
+
{value}
); -} +}; const FeatureViewDataTable = (data: any) => { var items: FeatureViewDataRowType[] = []; - for (let element in data.data){ + for (let element in data.data) { const row: FeatureViewDataRowType = { name: element, value: JSON.stringify(data.data[element], null, 2), @@ -60,48 +55,44 @@ const FeatureViewDataTable = (data: any) => { return ( - - Data Item Name - - - Data Item Value - + Data Item Name + Data Item Value {items.map((item) => { - return + return ; })} - ) -} + ); +}; const DataTab = () => { - const fName = "credit_history" + const fName = "credit_history"; const { isLoading, isError, isSuccess, data } = DataQuery(fName); const isEmpty = data === undefined; return ( - {isLoading && ( - - Loading - - )} - {isEmpty &&

No feature view with name: {fName}

} - {isError &&

Error loading feature view: {fName}

} - {isSuccess && data && ( - - - - - -

Properties

-
- - -
-
-
-
+ {isLoading && ( + + Loading + + )} + {isEmpty &&

No feature view with name: {fName}

} + {isError &&

Error loading feature view: {fName}

} + {isSuccess && data && ( + + + + + +

Properties

+
+ + +
+
+
+
)}
); diff --git a/ui/src/pages/feature-services/FeatureServiceListingTable.tsx b/ui/src/pages/feature-services/FeatureServiceListingTable.tsx index c6205b020a..b865da6e23 100644 --- a/ui/src/pages/feature-services/FeatureServiceListingTable.tsx +++ b/ui/src/pages/feature-services/FeatureServiceListingTable.tsx @@ -53,10 +53,10 @@ const FeatureServiceListingTable = ({ }, }, { - name: "Created at", - field: "meta.createdTimestamp", + name: "Last updated", + field: "meta.lastUpdatedTimestamp", render: (date: Date) => { - return date.toLocaleDateString("en-CA"); + return date ? date.toLocaleDateString("en-CA") : "n/a"; }, }, ]; diff --git a/ui/src/pages/feature-services/FeatureServiceOverviewTab.tsx b/ui/src/pages/feature-services/FeatureServiceOverviewTab.tsx index a3fc897325..ea62b3b3a7 100644 --- a/ui/src/pages/feature-services/FeatureServiceOverviewTab.tsx +++ b/ui/src/pages/feature-services/FeatureServiceOverviewTab.tsx @@ -66,14 +66,20 @@ const FeatureServiceOverviewTab = () => { description="Feature Views" /> - - - + {data.meta.lastUpdatedTimestamp ? ( + + + + ) : ( + + No last updated timestamp specified on this feature service. 
+ + )} diff --git a/ui/src/parsers/feastFeatureServices.ts b/ui/src/parsers/feastFeatureServices.ts index 96c03e38ef..6812b7e02c 100644 --- a/ui/src/parsers/feastFeatureServices.ts +++ b/ui/src/parsers/feastFeatureServices.ts @@ -19,7 +19,8 @@ const FeastFeatureServiceSchema = z.object({ description: z.string().optional(), }), meta: z.object({ - createdTimestamp: z.string().transform((val) => new Date(val)), + createdTimestamp: z.string().transform((val) => new Date(val)).optional(), + lastUpdatedTimestamp: z.string().transform((val) => new Date(val)).optional(), }), }); diff --git a/ui/src/parsers/feastODFVS.ts b/ui/src/parsers/feastODFVS.ts index 8341438d50..4d09cc72df 100644 --- a/ui/src/parsers/feastODFVS.ts +++ b/ui/src/parsers/feastODFVS.ts @@ -1,6 +1,5 @@ import { z } from "zod"; import { FeastFeatureColumnSchema } from "./feastFeatureViews"; -import { FEAST_FEATURE_VALUE_TYPES } from "./types"; const FeatureViewProjectionSchema = z.object({ featureViewProjection: z.object({ diff --git a/ui/src/parsers/feastSavedDataset.ts b/ui/src/parsers/feastSavedDataset.ts index 2c97acda74..ce1d39b4e7 100644 --- a/ui/src/parsers/feastSavedDataset.ts +++ b/ui/src/parsers/feastSavedDataset.ts @@ -8,11 +8,11 @@ const FeastSavedDatasetSchema = z.object({ storage: z.object({ fileStorage: z.object({ fileFormat: z.object({ - parquestFormat: z.object({}).optional(), - }), + parquetFormat: z.object({}).optional(), + }).optional(), fileUrl: z.string(), - }), - }), + }).optional(), + }).optional(), featureService: z .object({ spec: z.object({ @@ -21,7 +21,7 @@ const FeastSavedDatasetSchema = z.object({ }) .transform((obj) => { return obj.spec.name; - }), + }).optional(), profile: z.string().optional(), }), meta: z.object({