diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 6ee866781d..c47a8ec5c3 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -1,7 +1,13 @@ name: build_wheels # Call this workflow from other workflows in the repository by specifying "uses: ./.github/workflows/build_wheels.yml" -on: [workflow_dispatch, workflow_call] +# Developers who are starting a new release should use this workflow to ensure wheels will be built correctly. +# Devs should check out their fork, add a tag to the last master commit on their fork, and run the release off of their fork on the added tag to ensure wheels will be built correctly. +on: + workflow_dispatch: + tags: + - 'v*.*.*' + workflow_call: jobs: get-version: @@ -60,7 +66,7 @@ jobs: uses: pypa/cibuildwheel@v2.7.0 env: CIBW_BUILD: "cp3*_x86_64" - CIBW_SKIP: "cp36-* *-musllinux_x86_64 cp310-macosx_x86_64" + CIBW_SKIP: "cp36-* cp37-* *-musllinux_x86_64 cp310-macosx_x86_64" CIBW_ARCHS: "native" CIBW_ENVIRONMENT: > COMPILE_GO=True PATH=$PATH:/usr/local/go/bin @@ -68,7 +74,12 @@ jobs: curl -o go.tar.gz https://dl.google.com/go/go1.18.2.linux-amd64.tar.gz tar -C /usr/local -xzf go.tar.gz go version + yum -y update && + yum install -y epel-release || yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-$(cut -d: -f5 /etc/system-release-cpe | cut -d. -f1).noarch.rpm && + yum install -y https://apache.jfrog.io/artifactory/arrow/centos/$(cut -d: -f5 /etc/system-release-cpe | cut -d. -f1)/apache-arrow-release-latest.rpm && + yum install -y --enablerepo=epel arrow-devel # For C++ CIBW_BEFORE_ALL_MACOS: | + brew install apache-arrow curl -o python.pkg https://www.python.org/ftp/python/3.9.12/python-3.9.12-macosx10.9.pkg sudo installer -pkg python.pkg -target / # There's a `git restore` in here because `make install-go-ci-dependencies` is actually messing up go.mod & go.sum. 
@@ -93,6 +104,7 @@ jobs: CIBW_BEFORE_BUILD: | git status git restore go.mod go.sum + brew install apache-arrow - uses: actions/upload-artifact@v2 with: name: wheels @@ -134,11 +146,11 @@ jobs: verify-python-wheels: runs-on: ${{ matrix.os }} - needs: [build-python-wheel, build-source-distribution] + needs: [build-python-wheel, build-source-distribution, get-version] strategy: matrix: os: [ubuntu-latest, macos-10.15 ] - python-version: [ "3.7", "3.8", "3.9", "3.10"] + python-version: [ "3.8", "3.9", "3.10"] from-source: [ True, False ] env: # this script is for testing servers @@ -153,6 +165,7 @@ jobs: else echo "Succeeded!" fi + VERSION_WITHOUT_PREFIX: ${{ needs.get-version.outputs.version_without_prefix }} steps: - name: Setup Python id: setup-python @@ -174,12 +187,24 @@ jobs: cd dist/ pip install wheel for f in *.whl; do pip install $f || true; done + - name: Install apache-arrow on ubuntu + if: ${{ matrix.from-source && matrix.os == 'ubuntu-latest' }} + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: ${{ matrix.from-source && matrix.os == 'macos-10.15' && matrix.python-version != '3.10' }} + run: brew install apache-arrow - name: Install dist with go if: ${{ matrix.from-source && (matrix.python-version != '3.10' || matrix.os == 'ubuntu-latest')}} env: COMPILE_GO: "True" run: | - pip install 'grpcio-tools==1.44.0' 'pybindgen==0.22.0' + pip install 'grpcio-tools==1.47.0' 'pybindgen==0.22.0' go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.26.0 go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.1.0 pip install dist/*tar.gz @@ -190,6 +215,20 @@ jobs: - name: 
Install OS X dependencies if: matrix.os == 'macos-10.15' run: brew install coreutils + # Validate that the feast version installed is not development and is the correct version of the tag we ran it off of. + - name: Validate Feast Version + run: | + VERSION_REGEX='[0-9]+\.[0-9]+\.[0-9]+' + OUTPUT_REGEX="^Feast SDK Version: \"$VERSION_REGEX\"$" + VERSION_OUTPUT=$(feast version) + VERSION=$(echo $VERSION_OUTPUT | grep -oE "$VERSION_REGEX") + OUTPUT=$(echo $VERSION_OUTPUT | grep -E "$OUTPUT_REGEX") + if [ -n "$OUTPUT" ] && [ "$VERSION" = "$VERSION_WITHOUT_PREFIX" ]; then + echo "Correct Feast Version Installed" + else + echo "$VERSION_OUTPUT from installed wheel is not in the correct format or doesn't have the right version $VERSION." + exit 1 + fi - name: Smoke test run: | feast init test_repo @@ -206,5 +245,5 @@ jobs: feast apply echo "$TEST_SCRIPT" > run-and-wait.sh pip install cffi - printf "\ngo_feature_retrieval: True" >> feature_store.yaml + printf "\ngo_feature_serving: True" >> feature_store.yaml bash run-and-wait.sh feast serve \ No newline at end of file diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index a0a6d7dd38..ba475e2585 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -39,6 +39,14 @@ jobs: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- - name: Install pip-tools run: pip install pip-tools + - name: Install apache-arrow on ubuntu + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev - name: Install dependencies run: | make compile-protos-go @@ -63,7 +71,13 @@ jobs: - name: Upgrade pip version run: | pip install --upgrade "pip>=21.3.1,<22.1" 
- - name: Install dependencies - run: make install-go-proto-dependencies + - name: Install apache-arrow on ubuntu + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev - name: Lint go run: make lint-go \ No newline at end of file diff --git a/.github/workflows/master_only.yml b/.github/workflows/master_only.yml index 0cb49bb525..c9ebcdaf04 100644 --- a/.github/workflows/master_only.yml +++ b/.github/workflows/master_only.yml @@ -127,6 +127,18 @@ jobs: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- - name: Install pip-tools run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow - name: Install dependencies run: make install-python-ci-dependencies - name: Setup Redis Cluster diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml new file mode 100644 index 0000000000..fead512408 --- /dev/null +++ b/.github/workflows/nightly-ci.yml @@ -0,0 +1,191 @@ +name: nightly-ci + +on: + schedule: + - cron: '00 08 * * *' # early morning 08:00 AM UTC, which is 1 AM PST/4 AM EST. 
+ +# concurrency is currently broken, see details https://github.com/actions/runner/issues/1532 +#concurrency: +# group: pr-integration-tests-${{ github.event.pull_request.number }} +# cancel-in-progress: true + +jobs: + check_date: + runs-on: ubuntu-latest + name: Check latest commit + outputs: + WAS_EDITED: ${{ steps.check_date.outputs.WAS_EDITED }} + steps: + - uses: actions/checkout@v2 + with: + ref: master + - id: check_date + name: Check if there were commits in the last day + if: ${{ github.event_name == 'schedule' }} + run: echo '::set-output name=WAS_EDITED::'$(test -n "$(git log --format=%H --since='24 hours ago')" && echo 'true' || echo 'false') + build-docker-image: + needs: [check_date] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + ref: master + submodules: recursive + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + install: true + - name: Set up AWS SDK + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-west-2 + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v1 + - name: Set ECR image tag + id: image-tag + run: echo "::set-output name=DOCKER_IMAGE_TAG::`git rev-parse HEAD`" + - name: Cache Public ECR Image + id: lambda_python_3_9 + uses: actions/cache@v2 + with: + path: ~/cache + key: lambda_python_3_9 + - name: Handle Cache Miss (pull public ECR image & save it to tar file) + if: steps.lambda_python_3_9.outputs.cache-hit != 'true' + run: | + mkdir -p ~/cache + docker pull public.ecr.aws/lambda/python:3.9 + docker save public.ecr.aws/lambda/python:3.9 -o ~/cache/lambda_python_3_9.tar + - name: Handle Cache Hit (load docker image from tar file) + if: steps.lambda_python_3_9.outputs.cache-hit == 'true' + run: | + docker load -i ~/cache/lambda_python_3_9.tar + - name: Build and 
push + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: feast-python-server + run: | + docker build \ + --file sdk/python/feast/infra/feature_servers/aws_lambda/Dockerfile \ + --tag $ECR_REGISTRY/$ECR_REPOSITORY:${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} \ + --load \ + . + docker push $ECR_REGISTRY/$ECR_REPOSITORY:${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} + outputs: + DOCKER_IMAGE_TAG: ${{ steps.image-tag.outputs.DOCKER_IMAGE_TAG }} + integration-test-python: + needs: [check_date, build-docker-image] + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8" ] + os: [ ubuntu-latest ] + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + services: + redis: + image: redis + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + with: + ref: master + submodules: recursive + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Setup Go + id: setup-go + uses: actions/setup-go@v2 + with: + go-version: 1.18.0 + - name: Set up gcloud SDK + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + export_default_credentials: true + - name: Use gcloud CLI + run: gcloud info + - name: Set up AWS SDK + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-west-2 + - name: Use AWS CLI + run: aws sts get-caller-identity + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + 
with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow + - name: Install dependencies + run: make install-python-ci-dependencies + - name: Setup Redis Cluster + run: | + docker pull vishnunair/docker-redis-cluster:latest + docker run -d -p 6001:6379 -p 6002:6380 -p 6003:6381 -p 6004:6382 -p 6005:6383 -p 6006:6384 --name redis-cluster vishnunair/docker-redis-cluster + - name: Test python + if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak + env: + FEAST_SERVER_DOCKER_IMAGE_TAG: ${{ needs.build-docker-image.outputs.DOCKER_IMAGE_TAG }} + FEAST_USAGE: "False" + IS_TEST: "True" + SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} + SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} + SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} + SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} + run: pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests --integration --durations=5 --timeout=1200 
--timeout_method=thread + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.xml + flags: integrationtests + env_vars: OS,PYTHON + fail_ci_if_error: true + verbose: true \ No newline at end of file diff --git a/.github/workflows/pr_integration_tests.yml b/.github/workflows/pr_integration_tests.yml index e1c7ed2de2..58bf45c687 100644 --- a/.github/workflows/pr_integration_tests.yml +++ b/.github/workflows/pr_integration_tests.yml @@ -153,6 +153,18 @@ jobs: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- - name: Install pip-tools run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow - name: Install dependencies run: make install-python-ci-dependencies - name: Setup Redis Cluster @@ -163,8 +175,6 @@ jobs: if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak env: FEAST_SERVER_DOCKER_IMAGE_TAG: ${{ needs.build-docker-image.outputs.DOCKER_IMAGE_TAG }} - FEAST_USAGE: "False" - IS_TEST: "True" SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} @@ -179,4 +189,4 @@ jobs: flags: integrationtests env_vars: OS,PYTHON fail_ci_if_error: true - verbose: true + verbose: true \ No newline at end of file diff --git a/.github/workflows/pr_local_integration_tests.yml 
b/.github/workflows/pr_local_integration_tests.yml new file mode 100644 index 0000000000..d4db8a3a7c --- /dev/null +++ b/.github/workflows/pr_local_integration_tests.yml @@ -0,0 +1,80 @@ +name: pr-local-integration-tests +# This runs local tests with containerized stubs of online stores. This is the main dev workflow + +on: + pull_request_target: + types: + - opened + - synchronize + - labeled + +jobs: + integration-test-python-local: + # all jobs MUST have this if check for 'ok-to-test' or 'approved' for security purposes. + if: + (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'lgtm' || github.event.label.name == 'ok-to-test')) || + (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') || contains(github.event.pull_request.labels.*.name, 'lgtm'))) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8" ] + os: [ ubuntu-latest ] + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v2 + with: + # pull_request_target runs the workflow in the context of the base repo + # as such actions/checkout needs to be explicit configured to retrieve + # code from the PR. 
+ ref: refs/pull/${{ github.event.pull_request.number }}/merge + submodules: recursive + - name: Setup Python + uses: actions/setup-python@v2 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Upgrade pip version + run: | + pip install --upgrade "pip>=21.3.1,<22.1" + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + /opt/hostedtoolcache/Python + /Users/runner/hostedtoolcache/Python + key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }} + restore-keys: | + ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- + - name: Install pip-tools + run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install dependencies + run: make install-python-ci-dependencies + - name: Set up gcloud SDK # TODO(adchia): remove this dependency + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + export_default_credentials: true + - name: Use gcloud CLI + run: gcloud info + - name: Test local integration tests + if: ${{ always() }} # this will guarantee that step won't be canceled and resources won't leak + run: make test-python-integration-local diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 
a9bf3deba3..7bbe9ad6ac 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -54,17 +54,28 @@ jobs: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip- - name: Install pip-tools run: pip install pip-tools + - name: Install apache-arrow on ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev + - name: Install apache-arrow on macos + if: matrix.os == 'macOS-latest' + run: brew install apache-arrow - name: Install dependencies run: make install-python-ci-dependencies - name: Test Python env: - IS_TEST: "True" SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} - run: FEAST_USAGE=False pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests + run: pytest -n 8 --cov=./ --cov-report=xml --color=yes sdk/python/tests - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: @@ -92,9 +103,13 @@ jobs: uses: actions/setup-go@v2 with: go-version: 1.18.0 - - name: Install dependencies - run: make install-go-proto-dependencies - - name: Compile protos - run: make compile-protos-go + - name: Install apache-arrow on ubuntu + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y 
-V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + sudo apt install -y -V libarrow-dev - name: Test run: make test-go diff --git a/CHANGELOG.md b/CHANGELOG.md index bd7e8098f3..80852af83d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,51 @@ # Changelog +# [0.23.0](https://github.com/feast-dev/feast/compare/v0.22.0...v0.23.0) (2022-08-02) + + +### Bug Fixes + +* Add dummy alias to pull_all_from_table_or_query ([#2956](https://github.com/feast-dev/feast/issues/2956)) ([5e45228](https://github.com/feast-dev/feast/commit/5e45228a406e6ee7f82e41cab7f734730ff2e73f)) +* Bump version of Guava to mitigate cve ([#2896](https://github.com/feast-dev/feast/issues/2896)) ([51df8be](https://github.com/feast-dev/feast/commit/51df8be5d3b9bc702393d00e9a6370c703510358)) +* Change numpy version on setup.py and upgrade it to resolve dependabot warning ([#2887](https://github.com/feast-dev/feast/issues/2887)) ([80ea7a9](https://github.com/feast-dev/feast/commit/80ea7a93a9d7ea19f9a1218430e008a33eb6d788)) +* Change the feature store plan method to public modifier ([#2904](https://github.com/feast-dev/feast/issues/2904)) ([0ec7d1a](https://github.com/feast-dev/feast/commit/0ec7d1abd3f509e17870ca168ece356382fb7fe9)) +* Deprecate 3.7 wheels and fix verification workflow ([#2934](https://github.com/feast-dev/feast/issues/2934)) ([040c910](https://github.com/feast-dev/feast/commit/040c9107b719a7b3f3c70ab743f148e47b0a0982)) +* Do not allow same column to be reused in data sources ([#2965](https://github.com/feast-dev/feast/issues/2965)) ([661c053](https://github.com/feast-dev/feast/commit/661c0535f34b042846562a3fb4cdab4ab4403459)) +* Fix build wheels workflow to install apache-arrow correctly ([#2932](https://github.com/feast-dev/feast/issues/2932)) ([bdeb4ae](https://github.com/feast-dev/feast/commit/bdeb4aeaf2a5cfa144a65cc84f7bfb26e3077e7a)) +* Fix file offline store logic for feature views without ttl 
([#2971](https://github.com/feast-dev/feast/issues/2971)) ([26f6b69](https://github.com/feast-dev/feast/commit/26f6b69b0e2c8a4ea37b43e3d1eaa4cdb8c085a9)) +* Fix grpc and update protobuf ([#2894](https://github.com/feast-dev/feast/issues/2894)) ([86e9efd](https://github.com/feast-dev/feast/commit/86e9efdc893de817a359feb939f06717716c0b17)) +* Fix night ci syntax error and update readme ([#2935](https://github.com/feast-dev/feast/issues/2935)) ([b917540](https://github.com/feast-dev/feast/commit/b917540c27052c01f872a2de686a6dd3b7a16e9c)) +* Fix nightly ci again ([#2939](https://github.com/feast-dev/feast/issues/2939)) ([1603c9e](https://github.com/feast-dev/feast/commit/1603c9e7765e08bb1832c03b66b754afbf8a9b4d)) +* Fix the go build and use CgoArrowAllocator to prevent incorrect garbage collection ([#2919](https://github.com/feast-dev/feast/issues/2919)) ([130746e](https://github.com/feast-dev/feast/commit/130746ea5cfadad6ef467c0cb0490d4745fdad70)) +* Fix typo in CONTRIBUTING.md ([#2955](https://github.com/feast-dev/feast/issues/2955)) ([8534f69](https://github.com/feast-dev/feast/commit/8534f69026d03e6e5964ef3e9bc69cc18397a879)) +* Fixing broken links to feast documentation on java readme and contribution ([#2892](https://github.com/feast-dev/feast/issues/2892)) ([d044588](https://github.com/feast-dev/feast/commit/d044588d702b3dc2dd6b9a9e28056df19d942a09)) +* Fixing Spark min / max entity df event timestamps range return order ([#2735](https://github.com/feast-dev/feast/issues/2735)) ([ac55ce2](https://github.com/feast-dev/feast/commit/ac55ce25388abfa35e93097bd14190eeba08a165)) +* Move gcp back to 1.47.0 since grpcio-tools 1.48.0 got yanked from pypi ([#2990](https://github.com/feast-dev/feast/issues/2990)) ([fc447eb](https://github.com/feast-dev/feast/commit/fc447eb3d0345dba6a45cdf5b1c1c2e982766cb9)) +* Refactor testing and sort out unit and integration tests ([#2975](https://github.com/feast-dev/feast/issues/2975)) 
([2680f7b](https://github.com/feast-dev/feast/commit/2680f7b031717b64e6ea3addf150369dccebdbc1)) +* Remove hard-coded integration test setup for AWS & GCP ([#2970](https://github.com/feast-dev/feast/issues/2970)) ([e4507ac](https://github.com/feast-dev/feast/commit/e4507ac16540cb3a7e29c31121963a0fe8f79fe4)) +* Resolve small typo in README file ([#2930](https://github.com/feast-dev/feast/issues/2930)) ([16ae902](https://github.com/feast-dev/feast/commit/16ae902909911bbf45d0e430895b3bc20bba01e9)) +* Revert "feat: Add snowflake online store ([#2902](https://github.com/feast-dev/feast/issues/2902))" ([#2909](https://github.com/feast-dev/feast/issues/2909)) ([38fd001](https://github.com/feast-dev/feast/commit/38fd00195f8ed309b2e7bae06d48cb10ab82f5aa)) +* Snowflake_online_read fix ([#2988](https://github.com/feast-dev/feast/issues/2988)) ([651ce34](https://github.com/feast-dev/feast/commit/651ce341687034ce07ca959f805f3c90dccfd4cc)) +* Spark source support table with pattern "db.table" ([#2606](https://github.com/feast-dev/feast/issues/2606)) ([3ce5139](https://github.com/feast-dev/feast/commit/3ce51391e0b2ebdec68c81d93b54f5d06bb427a6)), closes [#2605](https://github.com/feast-dev/feast/issues/2605) +* Switch mysql log string to use regex ([#2976](https://github.com/feast-dev/feast/issues/2976)) ([5edf4b0](https://github.com/feast-dev/feast/commit/5edf4b0332a298a0e172dd58e0a627efe5705eec)) +* Update gopy to point to fork to resolve github annotation errors. 
([#2940](https://github.com/feast-dev/feast/issues/2940)) ([ba2dcf1](https://github.com/feast-dev/feast/commit/ba2dcf13fe9dc4c082816a737100e00e3e9a8ad2)) +* Version entity serialization mechanism and fix issue with int64 vals ([#2944](https://github.com/feast-dev/feast/issues/2944)) ([d0d27a3](https://github.com/feast-dev/feast/commit/d0d27a35a0d63a139970cb17542764ff2aaf6aaf)) + + +### Features + +* Add an experimental lambda-based materialization engine ([#2923](https://github.com/feast-dev/feast/issues/2923)) ([6f79069](https://github.com/feast-dev/feast/commit/6f79069c561eba888d070c46aae920f7ad0c2319)) +* Add column reordering to `write_to_offline_store` ([#2876](https://github.com/feast-dev/feast/issues/2876)) ([8abc2ef](https://github.com/feast-dev/feast/commit/8abc2ef76d461b6b4bbd97e2dfdf29c1c335cb80)) +* Add custom JSON table tab w/ formatting ([#2851](https://github.com/feast-dev/feast/issues/2851)) ([0159f38](https://github.com/feast-dev/feast/commit/0159f3875de7c8509c465346bd13dd11fba0d467)) +* Add CustomSourceOptions to SavedDatasetStorage ([#2958](https://github.com/feast-dev/feast/issues/2958)) ([23c09c8](https://github.com/feast-dev/feast/commit/23c09c83bc530de830ba867b10ceb02f113db5d6)) +* Add Go option to `feast serve` command ([#2966](https://github.com/feast-dev/feast/issues/2966)) ([a36a695](https://github.com/feast-dev/feast/commit/a36a6950b34d718ad328b4faca0c178fb23a3100)) +* Add interfaces for batch materialization engine ([#2901](https://github.com/feast-dev/feast/issues/2901)) ([38b28ca](https://github.com/feast-dev/feast/commit/38b28ca0181610c65d966a2f09456dbb102fbced)) +* Add pages for individual Features to the Feast UI ([#2850](https://github.com/feast-dev/feast/issues/2850)) ([9b97fca](https://github.com/feast-dev/feast/commit/9b97fca876d9520d6e1f9025562036330cc0aabd)) +* Add snowflake online store ([#2902](https://github.com/feast-dev/feast/issues/2902)) 
([f758f9e](https://github.com/feast-dev/feast/commit/f758f9e148212d08f63df155e864940c27d92155)), closes [#2903](https://github.com/feast-dev/feast/issues/2903) +* Add Snowflake online store (again) ([#2922](https://github.com/feast-dev/feast/issues/2922)) ([2ef71fc](https://github.com/feast-dev/feast/commit/2ef71fc6b3ec4fca3b543f2f64bed765b09c3af4)), closes [#2903](https://github.com/feast-dev/feast/issues/2903) +* Add to_remote_storage method to RetrievalJob ([#2916](https://github.com/feast-dev/feast/issues/2916)) ([109ee9c](https://github.com/feast-dev/feast/commit/109ee9cff5bcda46889583f2968003f6a3e375b3)) +* Support retrieval from multiple feature views with different join keys ([#2835](https://github.com/feast-dev/feast/issues/2835)) ([056cfa1](https://github.com/feast-dev/feast/commit/056cfa1b21db4ff092b9d1f9c06f7300a4c9f4b7)) + # [0.22.0](https://github.com/feast-dev/feast/compare/v0.21.0...v0.22.0) (2022-06-29) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4bd14d762a..a8671d9986 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,7 +95,7 @@ pip install --upgrade pip make build-ui ``` -5Install development dependencies for Feast Python SDK / CLI +5. Install development dependencies for Feast Python SDK / CLI ```sh pip install -e ".[dev]" ``` @@ -133,17 +133,19 @@ make test-python ### Integration Tests There are two sets of tests you can run: -1. Local integration tests (for faster development) +1. Local integration tests (for faster development, tests file offline store & key online stores) 2. Full integration tests (requires cloud environment setups) #### Local integration tests -To get local integration tests running, you'll need to have Redis setup: +For this approach of running tests, you'll need to have docker set up locally: [Get Docker](https://docs.docker.com/get-docker/) + +It leverages a file based offline store to test against emulated versions of Datastore, DynamoDB, and Redis, using ephemeral containers. -Redis -1. 
Install Redis: [Quickstart](https://redis.io/topics/quickstart) -2. Run `redis-server` +These tests create new temporary tables / datasets locally only, and they are cleaned up when the containers are torn down. -Now run `make test-python-universal-local` +```sh +make test-python-integration-local +``` #### Full integration tests To test across clouds, on top of setting up Redis, you also need GCP / AWS / Snowflake setup. @@ -159,23 +161,78 @@ To test across clouds, on top of setting up Redis, you also need GCP / AWS / Sno gcloud auth login gcloud auth application-default login ``` -3. Export `GCLOUD_PROJECT=[your project]` to your .zshrc +- When you run `gcloud auth application-default login`, you should see some output of the form: + ``` + Credentials saved to file: [$HOME/.config/gcloud/application_default_credentials.json] + ``` +- You should run `export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json"` to add the application credentials to your .zshrc or .bashrc. +3. Run `export GCLOUD_PROJECT=[your project]` to your .zshrc or .bashrc. +4. Running `gcloud config list` should give you something like this: +```sh +$ gcloud config list +[core] +account = [your email] +disable_usage_reporting = True +project = [your project] + +Your active configuration is: [default] +``` +5. Export gcp specific environment variables. Namely, +```sh +export GCS_REGION='[your gcs region e.g US]' +export GCS_STAGING_LOCATION='[your gcs staging location]' +``` **AWS** 1. TODO(adchia): flesh out setting up AWS login (or create helper script) -2. Modify `RedshiftDataSourceCreator` to use your credentials +2. To run the AWS Redshift and Dynamo integration tests you will have to export your own AWS credentials. 
Namely, + +```sh +export AWS_REGION='[your aws region]' +export AWS_CLUSTER_ID='[your aws cluster id]' +export AWS_USER='[your aws user]' +export AWS_DB='[your aws database]' +export AWS_STAGING_LOCATION='[your s3 staging location uri]' +export AWS_IAM_ROLE='[redshift and s3 access role]' +export AWS_LAMBDA_ROLE='[your aws lambda execution role]' +export AWS_REGISTRY_PATH='[your aws registry path]' +``` **Snowflake** -- See https://signup.snowflake.com/ +1. See https://signup.snowflake.com/ to set up a trial. +2. Then to run successfully, you'll need some environment variables setup: +```sh +export SNOWFLAKE_CI_DEPLOYMENT='[snowflake_deployment]' +export SNOWFLAKE_CI_USER='[your user]' +export SNOWFLAKE_CI_PASSWORD='[your pw]' +export SNOWFLAKE_CI_ROLE='[your CI role e.g. SYSADMIN]' +export SNOWFLAKE_CI_WAREHOUSE='[your warehouse]' +``` Then run `make test-python-integration`. Note that for Snowflake / GCP / AWS, this will create new temporary tables / datasets. +#### Running specific provider tests or running your test against specific online or offline stores + +1. If you don't need to have your test run against all of the providers (`gcp`, `aws`, and `snowflake`) or don't need to run against all of the online stores, you can tag your test with specific providers or stores that you need (`@pytest.mark.universal_offline_stores` or `@pytest.mark.universal_online_stores` with the `only` parameter). The `only` parameter selects specific offline providers and online stores that your test will test against. Example: + +```python +# Only parametrizes this test with the sqlite online store +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_feature_get_online_features_types_match(): +``` + +2. You can also filter tests to run by using pytest's cli filtering. Instead of using the make commands to test Feast, you can filter tests by name with the `-k` parameter.
The parametrized integration tests are all uniquely identified by their provider and online store so the `-k` option can select only the tests that you need to run. For example, to run only Redshift related tests, you can use the following command: + +```sh +python -m pytest -n 8 --integration -k Redshift sdk/python/tests +``` + #### (Experimental) Run full integration tests against containerized services Test across clouds requires existing accounts on GCP / AWS / Snowflake, and may incur costs when using these services. For this approach of running tests, you'll need to have docker set up locally: [Get Docker](https://docs.docker.com/get-docker/) -It's possible to run some integration tests against emulated local versions of these services, using ephemeral containers. +It's possible to run some integration tests against emulated local versions of these services, using ephemeral containers. These tests create new temporary tables / datasets locally only, and they are cleaned up when the containers are torn down.
The services with containerized replacements currently implemented are: diff --git a/Makefile b/Makefile index 88f04aa95d..ee2b7c8f1b 100644 --- a/Makefile +++ b/Makefile @@ -68,8 +68,30 @@ test-python: test-python-integration: FEAST_USAGE=False IS_TEST=True python -m pytest -n 8 --integration sdk/python/tests +test-python-integration-local: + @(docker info > /dev/null 2>&1 && \ + FEAST_USAGE=False \ + IS_TEST=True \ + FEAST_IS_LOCAL_TEST=True \ + FEAST_LOCAL_ONLINE_CONTAINER=True \ + python -m pytest -n 8 --integration \ + -k "not test_apply_entity_integration and \ + not test_apply_feature_view_integration and \ + not test_apply_data_source_integration and \ + not test_lambda_materialization and \ + not test_feature_view_inference_success and \ + not test_update_file_data_source_with_inferred_event_timestamp_col and \ + not test_nullable_online_store" \ + sdk/python/tests \ + ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; + test-python-integration-container: - FEAST_USAGE=False IS_TEST=True FEAST_LOCAL_ONLINE_CONTAINER=True python -m pytest -n 8 --integration sdk/python/tests + @(docker info > /dev/null 2>&1 && \ + FEAST_USAGE=False \ + IS_TEST=True \ + FEAST_LOCAL_ONLINE_CONTAINER=True \ + python -m pytest -n 8 --integration sdk/python/tests \ + ) || echo "This script uses Docker, and it isn't running - please start the Docker Daemon and try again!"; test-python-universal-contrib: PYTHONPATH='.' 
\ @@ -104,14 +126,11 @@ test-python-universal-postgres: not test_universal_types" \ sdk/python/tests -test-python-universal-local: - FEAST_USAGE=False IS_TEST=True FEAST_IS_LOCAL_TEST=True python -m pytest -n 8 --integration sdk/python/tests - test-python-universal: FEAST_USAGE=False IS_TEST=True python -m pytest -n 8 --integration sdk/python/tests test-python-go-server: compile-go-lib - FEAST_USAGE=False IS_TEST=True FEAST_GO_FEATURE_RETRIEVAL=True pytest --integration --goserver sdk/python/tests + FEAST_USAGE=False IS_TEST=True pytest --integration --goserver sdk/python/tests format-python: # Sort @@ -172,32 +191,35 @@ install-go-proto-dependencies: install-go-ci-dependencies: # TODO: currently gopy installation doesn't work w/o explicit go get in the next line # TODO: there should be a better way to install gopy - go get github.com/go-python/gopy@v0.4.0 + go get github.com/go-python/gopy@v0.4.4 go install golang.org/x/tools/cmd/goimports # The `go get` command on the previous lines download the lib along with replacing the dep to `feast-dev/gopy` # but the following command is needed to install it for some reason. go install github.com/go-python/gopy - python -m pip install pybindgen==0.22.0 + python -m pip install pybindgen==0.22.0 protobuf==3.20.1 install-protoc-dependencies: - pip install grpcio-tools==1.44.0 mypy-protobuf==3.1.0 + pip install grpcio-tools==1.47.0 mypy-protobuf==3.1.0 compile-protos-go: install-go-proto-dependencies install-protoc-dependencies python setup.py build_go_protos compile-go-lib: install-go-proto-dependencies install-go-ci-dependencies - COMPILE_GO=True python setup.py build_ext --inplace + CGO_LDFLAGS_ALLOW=".*" COMPILE_GO=True python setup.py build_ext --inplace -# Needs feast package to setup the feature store -test-go: compile-protos-go +install-feast-ci-locally: pip install -e ".[ci]" - go test ./... 
+ +# Needs feast package to setup the feature store +# CGO flag is due to this issue: https://github.com/golang/go/wiki/InvalidFlag +test-go: compile-protos-go compile-go-lib install-feast-ci-locally + CGO_LDFLAGS_ALLOW=".*" go test -tags cgo,ccalloc ./... format-go: gofmt -s -w go/ -lint-go: compile-protos-go - go vet ./go/internal/feast ./go/embedded +lint-go: compile-protos-go compile-go-lib + go vet -tags cgo,ccalloc ./go/internal/feast ./go/embedded # Docker diff --git a/README.md b/README.md index 0f77fbd42c..ab69636a20 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,14 @@ ## Overview -Feast is an open source feature store for machine learning. Feast is the fastest path to productionizing analytic data for model training and online inference. +Feast (**Fea**ture **St**ore) is an open source feature store for machine learning. Feast is the fastest path to manage existing infrastructure to productionize analytic data for model training and online inference. + + +Feast allows ML platform teams to: + +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). +* **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. This ensures that future feature values do not leak to models during training. +* **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another.
Please see our [documentation](https://docs.feast.dev/) for more information about the project. @@ -135,11 +142,10 @@ pprint(feature_vector) ## 📦 Functionality and Roadmap -The list below contains the functionality that contributors are planning to develop for Feast +The list below contains the functionality that contributors are planning to develop for Feast. -* Items below that are in development (or planned for development) will be indicated in parentheses. * We welcome contribution to all items in the roadmap! -* Want to speak to a Feast contributor? We are more than happy to jump on a call. Please schedule a time using [Calendly](https://calendly.com/d/x2ry-g5bb/meet-with-feast-team). +* Have questions about the roadmap? Go to the Slack channel to ask on #feast-development. * **Data Sources** * [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) @@ -151,7 +157,6 @@ The list below contains the functionality that contributors are planning to deve * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) * [x] Kafka / Kinesis sources (via [push support into the online store](https://docs.feast.dev/reference/data-sources/push)) - * [ ] HTTP source * **Offline Stores** * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) @@ -164,6 +169,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [In-memory / Pandas](https://docs.feast.dev/reference/offline-stores/file) * [x] [Custom offline store support](https://docs.feast.dev/how-to-guides/adding-a-new-offline-store) * **Online Stores** + * [x] [Snowflake](https://docs.feast.dev/reference/online-stores/snowflake) * [x] [DynamoDB](https://docs.feast.dev/reference/online-stores/dynamodb) * [x] 
[Redis](https://docs.feast.dev/reference/online-stores/redis) * [x] [Datastore](https://docs.feast.dev/reference/online-stores/datastore) @@ -184,29 +190,19 @@ The list below contains the functionality that contributors are planning to deve * **Deployments** * [x] AWS Lambda (Alpha release. See [RFC](https://docs.google.com/document/d/1eZWKWzfBif66LDN32IajpaG-j82LSHCCOzY6R7Ax7MI/edit)) * [x] Kubernetes (See [guide](https://docs.feast.dev/how-to-guides/running-feast-in-production#4.3.-java-based-feature-server-deployed-on-kubernetes)) - * [ ] Cloud Run - * [ ] KNative * **Feature Serving** * [x] Python Client - * [x] REST Feature Server (Python) (Alpha release. See [RFC](https://docs.google.com/document/d/1iXvFhAsJ5jgAhPOpTdB3j-Wj1S9x3Ev\_Wr6ZpnLzER4/edit)) - * [x] gRPC Feature Server (Java) (See [#1497](https://github.com/feast-dev/feast/issues/1497)) - * [x] Push API - * [ ] Java Client - * [ ] Go Client - * [ ] Delete API - * [] Feature Logging (for training) + * [x] [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) + * [x] [Go feature server](https://docs.feast.dev/reference/feature-servers/go-feature-server) * **Data Quality Management (See [RFC](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit))** * [x] Data profiling and validation (Great Expectations) - * [ ] Training-serving skew detection (in progress) - * [ ] Metric production - * [ ] Drift detection * **Feature Discovery and Governance** * [x] Python SDK for browsing feature registry * [x] CLI for browsing feature registry * [x] Model-centric feature tracking (feature services) * [x] Amundsen integration (see [Feast extractor](https://github.com/amundsen-io/amundsen/blob/main/databuilder/databuilder/extractor/feast_extractor.py)) - * [x] Feast Web UI (Alpha release. 
See [documentation](https://docs.feast.dev/reference/alpha-web-ui)) - * [ ] REST API for browsing feature registry + * [x] DataHub integration (see [DataHub Feast docs](https://datahubproject.io/docs/generated/ingestion/sources/feast/)) + * [x] Feast Web UI (Alpha release. See [docs](https://docs.feast.dev/reference/alpha-web-ui)) ## 🎓 Important Resources diff --git a/docs/README.md b/docs/README.md index f8b9af3c32..1b70f8fedc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,43 +2,59 @@ ## What is Feast? -Feast (**Fea**ture **St**ore) is an operational data system for managing and serving machine learning features to models in production. Feast is able to serve feature data to models from a low-latency online store (for real-time prediction) or from an offline store (for scale-out batch scoring or model training). +Feast (**Fea**ture **St**ore) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to realtime models. -![](assets/feast-marchitecture.png) - -## Problems Feast Solves +Feast allows ML platform teams to: -**Models need consistent access to data:** Machine Learning (ML) systems built on traditional data infrastructure are often coupled to databases, object stores, streams, and files. A result of this coupling, however, is that any change in data infrastructure may break dependent ML systems. Another challenge is that dual implementations of data retrieval for training and serving can lead to inconsistencies in data, which in turn can lead to training-serving skew. +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). 
+* **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. This ensures that future feature values do not leak to models during training. +* **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another. -Feast decouples your models from your data infrastructure by providing a single data access layer that abstracts feature storage from feature retrieval. Feast also provides a consistent means of referencing feature data for retrieval, and therefore ensures that models remain portable when moving from training to serving. +{% hint style="info" %} +**Note:** Feast today primarily addresses _timestamped structured data_. +{% endhint %} -**Deploying new features into production is difficult:** Many ML teams consist of members with different objectives. Data scientists, for example, aim to deploy features into production as soon as possible, while engineers want to ensure that production systems remain stable. These differing objectives can create an organizational friction that slows time-to-market for new features. ![](assets/feast-marchitecture.png) -Feast addresses this friction by providing both a centralized registry to which data scientists can publish features and a battle-hardened serving layer. Together, these enable non-engineering teams to ship features into production with minimal oversight. ## Who is Feast for? -**Models need point-in-time correct data:** ML models in production require a view of data consistent with the one on which they are trained, otherwise the accuracy of these models could be compromised.
Despite this need, many data science projects suffer from inconsistencies introduced by future feature values being leaked to models during training. +Feast helps ML platform teams with DevOps experience productionize real-time models. Feast can also help these teams build towards a feature platform that improves collaboration between engineers and data scientists. -Feast solves the challenge of data leakage by providing point-in-time correct feature retrieval when exporting feature datasets for model training. + -**Features aren't reused across projects:** Different teams within an organization are often unable to reuse features across projects. The siloed nature of development and the monolithic design of end-to-end ML systems contribute to duplication of feature creation and usage across teams and projects. +Feast is likely **not** the right tool if you -Feast addresses this problem by introducing feature reuse through a centralized registry. This registry enables multiple teams working on different projects not only to contribute features, but also to reuse these same features. With Feast, data scientists can start new ML projects by selecting previously engineered features from a centralized registry, and are no longer required to develop new features for each project. +* are in an organization that’s just getting started with ML and is not yet sure what the business impact of ML is +* rely primarily on unstructured data +* need very low latency feature retrieval (e.g. p99 feature retrieval << 10ms) +* have a small team to support a large number of use cases -## Problems Feast does not yet solve +## What Feast is not? -**Feature engineering:** We aim for Feast to support light-weight feature engineering as part of our API. +### Feast is not -**Feature discovery:** We also aim for Feast to include a first-class user interface for exploring and discovering entities and features. 
+* **an** [**ETL**](https://en.wikipedia.org/wiki/Extract,\_transform,\_load) / [**ELT**](https://en.wikipedia.org/wiki/Extract,\_load,\_transform) **system:** Feast is not (and does not plan to become) a general purpose data transformation or pipelining system. Users often leverage tools like [dbt](https://www.getdbt.com) to manage upstream data transformations. +* **a data orchestration tool:** Feast does not manage or orchestrate complex workflow DAGs. It relies on upstream data pipelines to produce feature values and integrations with tools like [Airflow](https://airflow.apache.org) to make features consistently available. +* **a data warehouse:** Feast is not a replacement for your data warehouse or the source of truth for all transformed data in your organization. Rather, Feast is a light-weight downstream layer that can serve data from an existing data warehouse (or other data sources) to models in production. +* **a database:** Feast is not a database, but helps manage data stored in other systems (e.g. BigQuery, Snowflake, DynamoDB, Redis) to make features consistently available at training / serving time -**Feature validation:** We additionally aim for Feast to improve support for statistics generation of feature data and subsequent validation of these statistics. Current support is limited. +### Feast does not _fully_ solve -## What Feast is not +* **reproducible model training / model backtesting / experiment management**: Feast captures feature and model metadata, but does not version-control datasets / labels or manage train / test splits. Other tools like [DVC](https://dvc.org/), [MLflow](https://www.mlflow.org/), and [Kubeflow](https://www.kubeflow.org/) are better suited for this. +* **batch + streaming feature engineering**: Feast primarily processes already transformed feature values (though it offers experimental light-weight transformations). Users usually integrate Feast with upstream systems (e.g. existing ETL/ELT pipelines). 
[Tecton](http://tecton.ai/) is a more fully featured feature platform which addresses these needs. +* **native streaming feature integration:** Feast enables users to push streaming features, but does not pull from streaming sources or manage streaming pipelines. [Tecton](http://tecton.ai/) is a more fully featured feature platform which orchestrates end to end streaming pipelines. +* **feature sharing**: Feast has experimental functionality to enable discovery and cataloguing of feature metadata with a [Feast web UI (alpha)](https://docs.feast.dev/reference/alpha-web-ui). Feast also has community contributed plugins with [DataHub](https://datahubproject.io/docs/generated/ingestion/sources/feast/) and [Amundsen](https://github.com/amundsen-io/amundsen/blob/4a9d60176767c4d68d1cad5b093320ea22e26a49/databuilder/databuilder/extractor/feast\_extractor.py). [Tecton](http://tecton.ai/) also more robustly addresses these needs. +* **lineage:** Feast helps tie feature values to model versions, but is not a complete solution for capturing end-to-end lineage from raw data sources to model versions. Feast also has community contributed plugins with [DataHub](https://datahubproject.io/docs/generated/ingestion/sources/feast/) and [Amundsen](https://github.com/amundsen-io/amundsen/blob/4a9d60176767c4d68d1cad5b093320ea22e26a49/databuilder/databuilder/extractor/feast\_extractor.py). [Tecton](http://tecton.ai/) captures more end-to-end lineage by also managing feature transformations. +* **data quality / drift detection**: Feast has experimental integrations with [Great Expectations](https://greatexpectations.io/), but is not purpose built to solve data drift / data quality issues. This requires more sophisticated monitoring across data pipelines, served feature values, labels, and model versions. 
-[**ETL**](https://en.wikipedia.org/wiki/Extract,\_transform,\_load) **or** [**ELT**](https://en.wikipedia.org/wiki/Extract,\_load,\_transform) **system:** Feast is not (and does not plan to become) a general purpose data transformation or pipelining system. Feast plans to include a light-weight feature engineering toolkit, but we encourage teams to integrate Feast with upstream ETL/ELT systems that are specialized in transformation. +## Example use cases -**Data warehouse:** Feast is not a replacement for your data warehouse or the source of truth for all transformed data in your organization. Rather, Feast is a light-weight downstream layer that can serve data from an existing data warehouse (or other data sources) to models in production. +Many companies have used Feast to power real-world ML use cases such as: -**Data catalog:** Feast is not a general purpose data catalog for your organization. Feast is purely focused on cataloging features for use in ML pipelines or systems, and only to the extent of facilitating the reuse of features. +* Personalizing online recommendations by leveraging pre-computed historical user or item features. +* Online fraud detection, using features that compare against (pre-computed) historical transaction patterns +* Churn prediction (an offline model), generating feature values for all users at a fixed cadence in batch +* Credit scoring, using pre-computed historical features to compute probability of default ## How can I get started? 
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 23049455e3..b0e88b413f 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -25,6 +25,7 @@ * [Offline store](getting-started/architecture-and-components/offline-store.md) * [Online store](getting-started/architecture-and-components/online-store.md) * [Provider](getting-started/architecture-and-components/provider.md) + * [Batch Materialization Engine](getting-started/architecture-and-components/batch-materialization-engine.md) * [Learning by example](getting-started/feast-workshop.md) * [Third party integrations](getting-started/third-party-integrations.md) * [FAQ](getting-started/faq.md) @@ -50,9 +51,9 @@ * [Load data into the online store](how-to-guides/feast-snowflake-gcp-aws/load-data-into-the-online-store.md) * [Read features from the online store](how-to-guides/feast-snowflake-gcp-aws/read-features-from-the-online-store.md) * [Running Feast in production](how-to-guides/running-feast-in-production.md) -* [Deploying a Java feature server on Kubernetes](how-to-guides/fetching-java-features-k8s.md) * [Upgrading from Feast 0.9](https://docs.google.com/document/u/1/d/1AOsr\_baczuARjCpmZgVd8mCqTF4AZ49OEyU4Cn-uTT0/edit) -* [Adding a custom provider](how-to-guides/creating-a-custom-provider.md) +* [Upgrading for Feast 0.20+](how-to-guides/automated-feast-upgrade.md) +* [Adding a custom batch materialization engine](how-to-guides/creating-a-custom-materialization-engine.md) * [Adding a new online store](how-to-guides/adding-support-for-a-new-online-store.md) * [Adding a new offline store](how-to-guides/adding-a-new-offline-store.md) * [Adding or reusing tests](how-to-guides/adding-or-reusing-tests.md) @@ -78,6 +79,7 @@ * [PostgreSQL (contrib)](reference/offline-stores/postgres.md) * [Online stores](reference/online-stores/README.md) * [SQLite](reference/online-stores/sqlite.md) + * [Snowflake](reference/online-stores/snowflake.md) * [Redis](reference/online-stores/redis.md) * 
[Datastore](reference/online-stores/datastore.md) * [DynamoDB](reference/online-stores/dynamodb.md) @@ -91,7 +93,7 @@ * [.feastignore](reference/feature-repository/feast-ignore.md) * [Feature servers](reference/feature-servers/README.md) * [Python feature server](reference/feature-servers/python-feature-server.md) - * [Go-based feature retrieval](reference/feature-servers/go-feature-retrieval.md) + * [Go feature server](reference/feature-servers/go-feature-server.md) * [\[Alpha\] Web UI](reference/alpha-web-ui.md) * [\[Alpha\] Data quality monitoring](reference/dqm.md) * [\[Alpha\] On demand feature view](reference/alpha-on-demand-feature-view.md) diff --git a/docs/community.md b/docs/community.md index c0ead3dda1..dc1cc8a0fe 100644 --- a/docs/community.md +++ b/docs/community.md @@ -1,16 +1,11 @@ # Community -{% hint style="success" %} -**Speak to us:** Have a question, feature request, idea, or just looking to speak to a real person? Set up a meeting with a Feast maintainer over [here](https://calendly.com/d/x2ry-g5bb/meet-with-feast-team)! -{% endhint %} - ## Links & Resources * [Slack](https://slack.feast.dev): Feel free to ask questions or say hello! * [Mailing list](https://groups.google.com/d/forum/feast-dev): We have both a user and developer mailing list. * Feast users should join [feast-discuss@googlegroups.com](mailto:feast-discuss@googlegroups.com) group by clicking [here](https://groups.google.com/g/feast-discuss). * Feast developers should join [feast-dev@googlegroups.com](mailto:feast-dev@googlegroups.com) group by clicking [here](https://groups.google.com/d/forum/feast-dev). - * People interested in the Feast community newsletter should join feast-announce by clicking [here](https://groups.google.com/d/forum/feast-announce). * [Community Calendar](https://calendar.google.com/calendar/u/0?cid=ZTFsZHVhdGM3MDU3YTJucTBwMzNqNW5rajBAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ): Includes community calls and design meetings. 
* [Google Folder](https://drive.google.com/drive/u/0/folders/1jgMHOPDT2DvBlJeO9LCM79DP4lm4eOrR): This folder is used as a central repository for all Feast resources. For example: * Design proposals in the form of Request for Comments (RFC). @@ -27,7 +22,7 @@ ## Community Calls -We have a user and contributor community call every two weeks (Asia & US friendly). +We have a user and contributor community call every two weeks (US & EU friendly). {% hint style="info" %} Please join the above Feast user groups in order to see calendar invites to the community calls diff --git a/docs/getting-started/architecture-and-components/README.md b/docs/getting-started/architecture-and-components/README.md index c3286b8315..8a6e181ea7 100644 --- a/docs/getting-started/architecture-and-components/README.md +++ b/docs/getting-started/architecture-and-components/README.md @@ -12,5 +12,4 @@ {% page-ref page="provider.md" %} - - +{% page-ref page="batch-materialization-engine.md" %} diff --git a/docs/getting-started/architecture-and-components/batch-materialization-engine.md b/docs/getting-started/architecture-and-components/batch-materialization-engine.md new file mode 100644 index 0000000000..fb3c83ccb4 --- /dev/null +++ b/docs/getting-started/architecture-and-components/batch-materialization-engine.md @@ -0,0 +1,10 @@ +# Batch Materialization Engine + +A batch materialization engine is a component of Feast that's responsible for moving data from the offline store into the online store. + +A materialization engine abstracts over specific technologies or frameworks that are used to materialize data. It allows users to use a pure local serialized approach (which is the default LocalMaterializationEngine), or delegates the materialization to separate components (e.g. AWS Lambda, as implemented by the LambdaMaterializationEngine). + +If the built-in engines are not sufficient, you can create your own custom materialization engine.
Please see [this guide](../../how-to-guides/creating-a-custom-materialization-engine.md) for more details. + +Please see [feature\_store.yaml](../../reference/feature-repository/feature-store-yaml.md#overview) for configuring engines. + diff --git a/docs/getting-started/architecture-and-components/overview.md b/docs/getting-started/architecture-and-components/overview.md index 0c47fb2753..97bd779503 100644 --- a/docs/getting-started/architecture-and-components/overview.md +++ b/docs/getting-started/architecture-and-components/overview.md @@ -5,6 +5,7 @@ ## Functionality * **Create Batch Features:** ELT/ETL systems like Spark and SQL are used to transform data in the batch store. +* **Create Stream Features:** Stream features are created from streaming services such as Kafka or Kinesis, and can be pushed directly into Feast. * **Feast Apply:** The user (or CI) publishes versioned controlled feature definitions using `feast apply`. This CLI command updates infrastructure and persists definitions in the object store registry. * **Feast Materialize:** The user (or scheduler) executes `feast materialize` which loads features from the offline store into the online store. * **Model Training:** A model training pipeline is launched. It uses the Feast Python SDK to retrieve a training dataset and trains a model. @@ -23,8 +24,10 @@ A complete Feast deployment contains the following components: * Materialize (load) feature values into the online store. * Build and retrieve training datasets from the offline store. * Retrieve online features. +* **Stream Processor:** The Stream Processor can be used to ingest feature data from streams and write it into the online or offline stores. Currently, there's an experimental Spark processor that's able to consume data from Kafka. +* **Batch Materialization Engine:** The [Batch Materialization Engine](batch-materialization-engine.md) component launches a process which loads data into the online store from the offline store. 
By default, Feast uses a local in-process engine implementation to materialize data. However, additional infrastructure can be used for a more scalable materialization process. * **Online Store:** The online store is a database that stores only the latest feature values for each entity. The online store is populated by materialization jobs and from [stream ingestion](../../reference/data-sources/push.md). -* **Offline Store:** The offline store persists batch data that has been ingested into Feast. This data is used for producing training datasets. Feast does not manage the offline store directly, but runs queries against it. +* **Offline Store:** The offline store persists batch data that has been ingested into Feast. This data is used for producing training datasets. For feature retrieval and materialization, Feast does not manage the offline store directly, but runs queries against it. However, offline stores can be configured to write data to the offline store if Feast is configured to log served features and the offline store supports this functionality. {% hint style="info" %} Java and Go Clients are also available for online feature retrieval. diff --git a/docs/getting-started/architecture-and-components/stream-processor.md b/docs/getting-started/architecture-and-components/stream-processor.md new file mode 100644 index 0000000000..13b6e5b304 --- /dev/null +++ b/docs/getting-started/architecture-and-components/stream-processor.md @@ -0,0 +1,8 @@ +# Stream Processor + +A Stream Processor is responsible for consuming data from stream sources (such as Kafka, Kinesis, etc.) and loading it directly into the online (and optionally the offline store). + +A Stream Processor abstracts over specific technologies or frameworks that are used to materialize data. An experimental Spark Processor for Kafka is available in Feast. + +If the built-in processor is not sufficient, you can create your own custom processor. 
Please see [this tutorial](../../tutorials/building-streaming-features.md) for more details. + diff --git a/docs/getting-started/concepts/README.md b/docs/getting-started/concepts/README.md index 6f2f64955d..0fc415f059 100644 --- a/docs/getting-started/concepts/README.md +++ b/docs/getting-started/concepts/README.md @@ -18,4 +18,4 @@ {% page-ref page="point-in-time-joins.md" %} -{% page-ref page="registry.md" %} \ No newline at end of file +{% page-ref page="registry.md" %} diff --git a/docs/getting-started/feast-workshop.md b/docs/getting-started/feast-workshop.md index c883625dac..8b6778c2d3 100644 --- a/docs/getting-started/feast-workshop.md +++ b/docs/getting-started/feast-workshop.md @@ -30,13 +30,15 @@ _See also:_ [_Feast quickstart_](https://docs.feast.dev/getting-started/quicksta These are meant mostly to be done in order, with examples building on previous concepts. -| Time (min) | Description | Module | -| :--------: | ----------------------------------------------------------------------- | --------------------------------------------------------------------------- | -| 30-45 | Setting up Feast projects & CI/CD + powering batch predictions | [Module 0](https://github.com/feast-dev/feast-workshop/tree/main/module\_0) | -| 15-20 | Streaming ingestion & online feature retrieval with Kafka, Spark, Redis | [Module 1](https://github.com/feast-dev/feast-workshop/tree/main/module\_1) | -| 10-15 | Real-time feature engineering with on demand transformations | [Module 2](https://github.com/feast-dev/feast-workshop/tree/main/module\_2) | -| TBD | Feature server deployment (embed, as a service, AWS Lambda) | TBD | -| TBD | Versioning features / models in Feast | TBD | -| TBD | Data quality monitoring in Feast | TBD | -| TBD | Batch transformations | TBD | -| TBD | Stream transformations | TBD | +See https://github.com/feast-dev/feast-workshop + +| Time (min) | Description | Module | +| :--------: | 
----------------------------------------------------------------------- |-----------| +| 30-45 | Setting up Feast projects & CI/CD + powering batch predictions | Module 0 | +| 15-20 | Streaming ingestion & online feature retrieval with Kafka, Spark, Redis | Module 1 | +| 10-15 | Real-time feature engineering with on demand transformations | Module 2 | +| TBD | Feature server deployment (embed, as a service, AWS Lambda) | TBD | +| TBD | Versioning features / models in Feast | TBD | +| TBD | Data quality monitoring in Feast | TBD | +| TBD | Batch transformations | TBD | +| TBD | Stream transformations | TBD | diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index b5fe7bad4b..7bbcb78732 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -82,7 +82,7 @@ online_store: from datetime import timedelta -from feast import Entity, FeatureView, Field, FileSource, ValueType +from feast import Entity, FeatureService, FeatureView, Field, FileSource, ValueType from feast.types import Float32, Int64 # Read data from parquet files. Parquet is convenient for local development mode. For @@ -224,7 +224,7 @@ To train a model, we need features and labels. Often, this label data is stored The user can query that table of labels with timestamps and pass that into Feast as an _entity dataframe_ for training data generation. In many cases, Feast will also intelligently join relevant tables to create the relevant feature vectors. -* Note that we include timestamps because want the features for the same driver at various timestamps to be used in a model. +* Note that we include timestamps because we want the features for the same driver at various timestamps to be used in a model. 
{% tabs %} {% tab title="Python" %} diff --git a/docs/getting-started/third-party-integrations.md b/docs/getting-started/third-party-integrations.md index 8a862891f8..ef47a11029 100644 --- a/docs/getting-started/third-party-integrations.md +++ b/docs/getting-started/third-party-integrations.md @@ -11,55 +11,11 @@ Don't see your offline store or online store of choice here? Check out our guide ## Integrations -### **Data Sources** - -* [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) -* [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) -* [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) -* [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) -* [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) -* [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) -* [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) -* [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) -* [x] Kafka / Kinesis sources (via [push support into the online store](https://docs.feast.dev/reference/data-sources/push)) -* [ ] HTTP source - -### Offline Stores - -* [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) -* [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) -* [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) -* [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) -* [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) -* [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/offline-stores/postgres) -* [x] [Trino (contrib plugin)](https://github.com/Shopify/feast-trino) -* [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/offline-stores/spark) -* [x] [In-memory / Pandas](https://docs.feast.dev/reference/offline-stores/file) -* [x] [Custom offline store 
support](https://docs.feast.dev/how-to-guides/adding-a-new-offline-store) - -### Online Stores - -* [x] [DynamoDB](https://docs.feast.dev/reference/online-stores/dynamodb) -* [x] [Redis](https://docs.feast.dev/reference/online-stores/redis) -* [x] [Datastore](https://docs.feast.dev/reference/online-stores/datastore) -* [x] [SQLite](https://docs.feast.dev/reference/online-stores/sqlite) -* [x] [Azure Cache for Redis (community plugin)](https://github.com/Azure/feast-azure) -* [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/online-stores/postgres) -* [x] [Custom online store support](https://docs.feast.dev/how-to-guides/adding-support-for-a-new-online-store) -* [x] [Cassandra / AstraDB](https://github.com/datastaxdevs/feast-cassandra-online-store) -* [ ] Bigtable (in progress) - -### **Deployments** - -* [x] AWS Lambda (Alpha release. See [guide](../reference/alpha-aws-lambda-feature-server.md) and [RFC](https://docs.google.com/document/d/1eZWKWzfBif66LDN32IajpaG-j82LSHCCOzY6R7Ax7MI/edit)) -* [x] Kubernetes (See [guide](https://docs.feast.dev/how-to-guides/running-feast-in-production#4.3.-java-based-feature-server-deployed-on-kubernetes)) -* [ ] Cloud Run -* [ ] KNative - +See [Functionality and Roadmap](../../README.md#-functionality-and-roadmap) ## Standards -In order for a plugin integration to be highlighted on this page, it must meet the following requirements: +In order for a plugin integration to be highlighted, it must meet the following requirements: 1. The plugin must have tests. Ideally it would use the Feast universal tests (see this [guide](../how-to-guides/adding-or-reusing-tests.md) for an example), but custom tests are fine. 2. The plugin must have some basic documentation on how it should be used. 
diff --git a/docs/how-to-guides/adding-a-new-offline-store.md b/docs/how-to-guides/adding-a-new-offline-store.md index 8eeac7bcf4..c548538fce 100644 --- a/docs/how-to-guides/adding-a-new-offline-store.md +++ b/docs/how-to-guides/adding-a-new-offline-store.md @@ -2,13 +2,13 @@ ## Overview -Feast makes adding support for a new offline store (database) easy. Developers can simply implement the [OfflineStore](../../sdk/python/feast/infra/offline\_stores/offline\_store.py#L41) interface to add support for a new store (other than the existing stores like Parquet files, Redshift, and Bigquery). +Feast makes adding support for a new offline store easy. Developers can simply implement the [OfflineStore](../../sdk/python/feast/infra/offline\_stores/offline\_store.py#L41) interface to add support for a new store (other than the existing stores like Parquet files, Redshift, and Bigquery). In this guide, we will show you how to extend the existing File offline store and use in a feature repo. While we will be implementing a specific store, this guide should be representative for adding support for any new offline store. The full working code for this guide can be found at [feast-dev/feast-custom-offline-store-demo](https://github.com/feast-dev/feast-custom-offline-store-demo). -The process for using a custom offline store consists of 4 steps: +The process for using a custom offline store consists of 8 steps: 1. Defining an `OfflineStore` class. 2. Defining an `OfflineStoreConfig` class. @@ -16,6 +16,8 @@ The process for using a custom offline store consists of 4 steps: 4. Defining a `DataSource` class for the offline store 5. Referencing the `OfflineStore` in a feature repo's `feature_store.yaml` file. 6. Testing the `OfflineStore` class. +7. Updating dependencies. +8. Adding documentation. ## 1. Defining an OfflineStore class @@ -23,16 +25,37 @@ The process for using a custom offline store consists of 4 steps: OfflineStore class names must end with the OfflineStore suffix! 
{% endhint %} +### Contrib offline stores + +New offline stores go in `sdk/python/feast/infra/offline_stores/contrib/`. + +#### What is a contrib plugin? + +- Not guaranteed to implement all interface methods +- Not guaranteed to be stable. +- Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. + +#### How do I make a contrib plugin an "official" plugin? +To move an offline store plugin out of contrib, you need: +- GitHub actions (i.e `make test-python-integration`) is setup to run all tests against the offline store and pass. +- At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). + +#### Define the offline store class The OfflineStore class contains a couple of methods to read features from the offline store. Unlike the OnlineStore class, Feast does not manage any infrastructure for the offline store. -There are two methods that deal with reading data from the offline stores`get_historical_features`and `pull_latest_from_table_or_query`. +To fully implement the interface for the offline store, you will need to implement these methods: * `pull_latest_from_table_or_query` is invoked when running materialization (using the `feast materialize` or `feast materialize-incremental` commands, or the corresponding `FeatureStore.materialize()` method. This method pull data from the offline store, and the `FeatureStore` class takes care of writing this data into the online store. * `get_historical_features` is invoked when reading values from the offline store using the `FeatureStore.get_historical_features()` method. Typically, this method is used to retrieve features when training ML models. -* `pull_all_from_table_or_query` is a method that pulls all the data from an offline store from a specified start date to a specified end date. +* (optional) `offline_write_batch` is a method that supports directly pushing a pyarrow table to a feature view. 
Given a feature view with a specific schema, this function should write the pyarrow table to the batch source defined. More details about the push api can be found [here](../reference/data-sources/push.md). This method only needs implementation if you want to support the push api in your offline store. +* (optional) `pull_all_from_table_or_query` is a method that pulls all the data from an offline store from a specified start date to a specified end date. This method is only used for **SavedDatasets** as part of data quality monitoring validation. +* (optional) `write_logged_features` is a method that takes a pyarrow table or a path that points to a parquet file and writes the data to a destination defined by `LoggingSource` and `LoggingConfig`. This method is only used internally for **SavedDatasets**. {% code title="feast_custom_offline_store/file.py" %} ```python + # Only prints out runtime warnings once. + warnings.simplefilter("once", RuntimeWarning) + def get_historical_features(self, config: RepoConfig, feature_views: List[FeatureView], @@ -40,14 +63,15 @@ There are two methods that deal with reading data from the offline stores`get_hi entity_df: Union[pd.DataFrame, str], registry: Registry, project: str, full_feature_names: bool = False) -> RetrievalJob: - print("Getting historical features from my offline store") - return super().get_historical_features(config, - feature_views, - feature_refs, - entity_df, - registry, - project, - full_feature_names) + """ Perform point-in-time correct join of features onto an entity dataframe (entity key and timestamp). More details about how this should work at https://docs.feast.dev/v/v0.6-branch/user-guide/feature-retrieval#3.-historical-feature-retrieval. + """ + warnings.warn( + "This offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + # Implementation here. + pass def pull_latest_from_table_or_query(self, config: RepoConfig, @@ -58,18 +82,78 @@ There are two methods that deal with reading data from the offline stores`get_hi created_timestamp_column: Optional[str], start_date: datetime, end_date: datetime) -> RetrievalJob: + """ Pulls data from the offline store for use in materialization.""" print("Pulling latest features from my offline store") - return super().pull_latest_from_table_or_query(config, - data_source, - join_key_columns, - feature_name_columns, - timestamp_field=timestamp_field, - created_timestamp_column, - start_date, - end_date) + warnings.warn( + "This offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + # Implementation here. + pass + + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + """ Optional method that returns a Retrieval Job for all join key columns, feature name columns, and the event timestamp columns that occur between the start_date and end_date.""" + warnings.warn( + "This offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + # Implementation here. + pass + + def write_logged_features( + config: RepoConfig, + data: Union[pyarrow.Table, Path], + source: LoggingSource, + logging_config: LoggingConfig, + registry: BaseRegistry, + ): + """ Optional method to have Feast support logging your online features.""" + warnings.warn( + "This offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + # Implementation here. + pass + + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + table: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): + """ Optional method to have Feast support the offline push api for your offline store.""" + warnings.warn( + "This offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + # Implementation here. + pass + ``` {% endcode %} +### 1.1 Type Mapping + +Most offline stores will have to perform some custom mapping of offline store datatypes to feast value types. +- The function to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. +* `source_datatype_to_feast_value_type` is used to convert your DataSource's datatypes to feast value types. +* `get_column_names_and_types` retrieves the column names and corresponding datasource types. + +Add any helper functions for type conversion to `sdk/python/feast/type_map.py`. +- Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data. + ## 2. Defining an OfflineStoreConfig class Additional configuration may be needed to allow the OfflineStore to talk to the backing store. For example, Redshift needs configuration information like the connection information for the Redshift instance, credentials for connecting to the database, etc. 
@@ -91,6 +175,8 @@ class CustomFileOfflineStoreConfig(FeastConfigBaseModel): type: Literal["feast_custom_offline_store.file.CustomFileOfflineStore"] \ = "feast_custom_offline_store.file.CustomFileOfflineStore" + + uri: str # URI for your offline store (in this case it would be a path) ``` {% endcode %} @@ -98,11 +184,18 @@ This configuration can be specified in the `feature_store.yaml` as follows: {% code title="feature_repo/feature_store.yaml" %} ```yaml -type: feast_custom_offline_store.file.CustomFileOfflineStore +project: my_project +registry: data/registry.db +provider: local +offline_store: + type: feast_custom_offline_store.file.CustomFileOfflineStore + uri: +online_store: + path: data/online_store.db ``` {% endcode %} -This configuration information is available to the methods of the OfflineStore, via the`config: RepoConfig` parameter which is passed into the methods of the OfflineStore interface, specifically at the `config.offline_store` field of the `config` parameter. +This configuration information is available to the methods of the OfflineStore, via the `config: RepoConfig` parameter which is passed into the methods of the OfflineStore interface, specifically at the `config.offline_store` field of the `config` parameter. These fields in the `feature_store.yaml` should map directly to your `OfflineStoreConfig` class that is detailed above in Section 2. {% code title="feast_custom_offline_store/file.py" %} ```python @@ -113,7 +206,11 @@ This configuration information is available to the methods of the OfflineStore, entity_df: Union[pd.DataFrame, str], registry: Registry, project: str, full_feature_names: bool = False) -> RetrievalJob: - + warnings.warn( + "This offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) offline_store_config = config.offline_store assert isinstance(offline_store_config, CustomFileOfflineStoreConfig) store_type = offline_store_config.type @@ -128,6 +225,8 @@ Custom offline stores may need to implement their own instances of the `Retrieva The `RetrievalJob` interface exposes two methods - `to_df` and `to_arrow`. The expectation is for the retrieval job to be able to return the rows read from the offline store as a parquet DataFrame, or as an Arrow table respectively. +Users who want to have their offline store support **scalable batch materialization** for online use cases (detailed in this [RFC](https://docs.google.com/document/d/1J7XdwwgQ9dY_uoV9zkRVGQjK9Sy43WISEW6D5V9qzGo/edit#heading=h.9gaqqtox9jg6)) will also need to implement `to_remote_storage` to distribute the reading and writing of offline store records to blob storage (such as S3). This may be used by a custom [Materialization Engine](https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/materialization/batch_materialization_engine.py#L72) to parallelize the materialization of data by processing it in chunks. If this is not implemented, Feast will default to local materialization (pulling all records into memory to materialize). + {% code title="feast_custom_offline_store/file.py" %} ```python class CustomFileRetrievalJob(RetrievalJob): @@ -148,6 +247,10 @@ class CustomFileRetrievalJob(RetrievalJob): print("Getting a pandas DataFrame from a File is easy!") df = self.evaluation_function() return pyarrow.Table.from_pandas(df) + + def to_remote_storage(self): + # Optional method to write to an offline storage location to support scalable batch materialization. 
+ pass ``` {% endcode %} @@ -171,6 +274,11 @@ class CustomFileDataSource(FileSource): created_timestamp_column: Optional[str] = "", date_partition_column: Optional[str] = "", ): + warnings.warn( + "This offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) super(CustomFileDataSource, self).__init__( timestamp_field=timestamp_field, created_timestamp_column, @@ -225,11 +331,12 @@ project: test_custom registry: data/registry.db provider: local offline_store: + # Make sure to specify the type as the fully qualified path that Feast can import. type: feast_custom_offline_store.file.CustomFileOfflineStore ``` {% endcode %} -If additional configuration for the offline store is **not **required, then we can omit the other fields and only specify the `type` of the offline store class as the value for the `offline_store`. +If additional configuration for the offline store is **not** required, then we can omit the other fields and only specify the `type` of the offline store class as the value for the `offline_store`. {% code title="feature_repo/feature_store.yaml" %} ```yaml @@ -244,7 +351,7 @@ Finally, the custom data source class can be use in the feature repo to define a {% code title="feature_repo/repo.py" %} ```python -pdriver_hourly_stats = CustomFileDataSource( +driver_hourly_stats = CustomFileDataSource( path="feature_repo/data/driver_stats.parquet", timestamp_field="event_timestamp", created_timestamp_column="created", @@ -260,23 +367,70 @@ driver_hourly_stats_view = FeatureView( ## 6. Testing the OfflineStore class -Even if you have created the `OfflineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. In the Feast submodule, we can run all the unit tests with: +### Integrating with the integration test suite and unit test suite. 
-``` -make test -``` +Even if you have created the `OfflineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. + +1. In order to test against the test suite, you need to create a custom `DataSourceCreator` that implement our testing infrastructure methods, `create_data_source` and optionally, `created_saved_dataset_destination`. + * `create_data_source` should create a datasource based on the dataframe passed in. It may be implemented by uploading the contents of the dataframe into the offline store and returning a datasource object pointing to that location. See `BigQueryDataSourceCreator` for an implementation of a data source creator. + * `created_saved_dataset_destination` is invoked when users need to save the dataset for use in data validation. This functionality is still in alpha and is **optional**. + +2. Make sure that your offline store doesn't break any unit tests first by running: + ``` + make test-python + ``` + +3. Next, set up your offline store to run the universal integration tests. These are integration tests specifically intended to test offline and online stores against Feast API functionality, to ensure that the Feast APIs works with your offline store. + - Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different offline store classes for testing. + - To overwrite the default configurations to use your own offline store, you can simply create your own file that contains a `FULL_REPO_CONFIGS` dictionary, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. The module should add new `IntegrationTestRepoConfig` classes to the `AVAILABLE_OFFLINE_STORES` by defining an offline store that you would like Feast to test with. 
+ + A sample `FULL_REPO_CONFIGS_MODULE` looks something like this: + + ```python + # Should go in sdk/python/feast/infra/offline_stores/contrib/postgres_repo_configuration.py + from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( + PostgreSQLDataSourceCreator, + ) -The universal tests, which are integration tests specifically intended to test offline and online stores, can be run with: + AVAILABLE_OFFLINE_STORES = [("local", PostgreSQLDataSourceCreator)] + ``` + +4. You should swap out the `FULL_REPO_CONFIGS` environment variable and run the integration tests against your offline store. In the example repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_offline_store/feast_tests.py`, so you would run: + + ```bash + export FULL_REPO_CONFIGS_MODULE='feast_custom_offline_store.feast_tests' + make test-python-universal + ``` + + If the integration tests fail, this indicates that there is a mistake in the implementation of this offline store! + +5. Remember to add your datasource to `repo_config.py` similar to how we added `spark`, `trino`, etc, to the dictionary `OFFLINE_STORE_CLASS_FOR_TYPE` and add the necessary configuration to `repo_configuration.py`. Namely, `AVAILABLE_OFFLINE_STORES` should load your repo configuration module. + +### 7. Dependencies + +Add any dependencies for your offline store to our `sdk/python/setup.py` under a new `__REQUIRED` list with the packages and add it to the setup script so that if your offline store is needed, users can install the necessary python packages. These packages should be defined as extras so that they are not installed by users by default. +You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. 
In each environment, run the following commands: ``` -make test-python-universal +export PYTHON= +make lock-python-ci-dependencies ``` -The unit tests should succeed, but the universal tests will likely fail. The tests are parametrized based on the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py`. To overwrite these configurations, you can simply create your own file that contains a `FULL_REPO_CONFIGS`, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. The main challenge there will be to write a `DataSourceCreator` for the offline store. In this repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_offline_store/feast_tests.py`, so you would run -``` -export FULL_REPO_CONFIGS_MODULE='feast_custom_offline_store.feast_tests' -make test-python-universal +### 8. Add Documentation + +Remember to add documentation for your offline store. + +1. Add a new markdown file to `docs/reference/offline-stores/` and `docs/reference/data-sources/`. Use these files to document your offline store functionality similar to how the other offline stores are documented. +2. You should also add a reference in `docs/reference/data-sources/README.md` and `docs/SUMMARY.md` to these markdown files. + +**NOTE**: Be sure to document the following things about your offline store: +- How to create the datasource and what configuration is needed in the `feature_store.yaml` file in order to create the datasource. +- Make sure to flag that the datasource is in alpha development. +- Add some documentation on what the data model is for the specific offline store for more clarity. +- Finally, generate the python code docs by running: + +```bash +make build-sphinx ```
You should notice that some of the tests actually fail; this indicates that there is a mistake in the implementation of this offline store! diff --git a/docs/how-to-guides/adding-support-for-a-new-online-store.md b/docs/how-to-guides/adding-support-for-a-new-online-store.md index fee47945bf..d1f5986f18 100644 --- a/docs/how-to-guides/adding-support-for-a-new-online-store.md +++ b/docs/how-to-guides/adding-support-for-a-new-online-store.md @@ -8,12 +8,15 @@ In this guide, we will show you how to integrate with MySQL as an online store. The full working code for this guide can be found at [feast-dev/feast-custom-online-store-demo](https://github.com/feast-dev/feast-custom-online-store-demo). -The process of using a custom online store consists of 3 steps: + +The process of using a custom online store consists of 6 steps: 1. Defining the `OnlineStore` class. 2. Defining the `OnlineStoreConfig` class. 3. Referencing the `OnlineStore` in a feature repo's `feature_store.yaml` file. 4. Testing the `OnlineStore` class. +5. Update dependencies. +6. Add documentation. ## 1. Defining an OnlineStore class @@ -21,6 +24,21 @@ The process of using a custom online store consists of 3 steps: OnlineStore class names must end with the OnlineStore suffix! {% endhint %} +### Contrib online stores + +New online stores go in `sdk/python/feast/infra/online_stores/contrib/`. + +#### What is a contrib plugin? + +- Not guaranteed to implement all interface methods +- Not guaranteed to be stable. +- Should have warnings for users to indicate this is a contrib plugin that is not maintained by the maintainers. + +#### How do I make a contrib plugin an "official" plugin? +To move an online store plugin out of contrib, you need: +- GitHub actions (i.e `make test-python-integration`) is setup to run all tests against the online store and pass. +- At least two contributors own the plugin (ideally tracked in our `OWNERS` / `CODEOWNERS` file). 
+ The OnlineStore class broadly contains two sets of methods * One set deals with managing infrastructure that the online store needed for operations @@ -40,6 +58,9 @@ The `teardown` method should be used to perform any clean-up operations. `teardo {% code title="feast_custom_online_store/mysql.py" %} ```python +# Only prints out runtime warnings once. +warnings.simplefilter("once", RuntimeWarning) + def update( self, config: RepoConfig, @@ -50,8 +71,13 @@ def update( partial: bool, ): """ - An example of creating manging the tables needed for a mysql-backed online store. + An example of creating managing the tables needed for a mysql-backed online store. """ + warnings.warn( + "This online store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) conn = self._get_conn(config) cur = conn.cursor(buffered=True) @@ -78,9 +104,11 @@ def teardown( tables: Sequence[Union[FeatureTable, FeatureView]], entities: Sequence[Entity], ): - """ - - """ + warnings.warn( + "This online store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) conn = self._get_conn(config) cur = conn.cursor(buffered=True) project = config.project @@ -102,6 +130,9 @@ There are two methods that deal with writing data to and from the online stores. {% code title="feast_custom_online_store/mysql.py" %} ```python +# Only prints out runtime warnings once. +warnings.simplefilter("once", RuntimeWarning) + def online_write_batch( self, config: RepoConfig, @@ -111,6 +142,11 @@ def online_write_batch( ], progress: Optional[Callable[[int], Any]], ) -> None: + warnings.warn( + "This online store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) conn = self._get_conn(config) cur = conn.cursor(buffered=True) @@ -135,6 +171,11 @@ def online_read( entity_keys: List[EntityKeyProto], requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + warnings.warn( + "This online store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) conn = self._get_conn(config) cur = conn.cursor(buffered=True) @@ -166,6 +207,16 @@ def online_read( ``` {% endcode %} +### 1.3 Type Mapping + +Most online stores will have to perform some custom mapping of online store datatypes to feast value types. +- The function to implement here are `source_datatype_to_feast_value_type` and `get_column_names_and_types` in your `DataSource` class. +* `source_datatype_to_feast_value_type` is used to convert your DataSource's datatypes to feast value types. +* `get_column_names_and_types` retrieves the column names and corresponding datasource types. + +Add any helper functions for type conversion to `sdk/python/feast/type_map.py`. +- Be sure to implement correct type mapping so that Feast can process your feature columns without casting incorrectly that can potentially cause loss of information or incorrect data. + ## 2. Defining an OnlineStoreConfig class Additional configuration may be needed to allow the OnlineStore to talk to the backing store. For example, MySQL may need configuration information like the host at which the MySQL instance is running, credentials for connecting to the database, etc. 
@@ -243,7 +294,8 @@ To use our MySQL online store, we can use the following `feature_store.yaml`: project: test_custom registry: data/registry.db provider: local -online_store: +online_store: + # Make sure to specify the type as the fully qualified path that Feast can import. type: feast_custom_online_store.mysql.MySQLOnlineStore user: foo password: bar @@ -263,23 +315,99 @@ online_store: feast_custom_online_store.mysql.MySQLOnlineStore ## 4. Testing the OnlineStore class -Even if you have created the `OnlineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. In the Feast submodule, we can run all the unit tests with: +### Integrating with the integration test suite and unit test suite. + +Even if you have created the `OnlineStore` class in a separate repo, you can still test your implementation against the Feast test suite, as long as you have Feast as a submodule in your repo. + +1. In the Feast submodule, we can run all the unit tests and make sure they pass: + ``` + make test-python + ``` + +2. The universal tests, which are integration tests specifically intended to test offline and online stores, should be run against Feast to ensure that the Feast APIs works with your online store. + - Feast parametrizes integration tests using the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py` which stores different online store classes for testing. + - To overwrite these configurations, you can simply create your own file that contains a `FULL_REPO_CONFIGS` variable, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. 
+ +A sample `FULL_REPO_CONFIGS_MODULE` looks something like this: + +{% code title="sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py" %} +```python +from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( + PostgreSQLDataSourceCreator, +) + +AVAILABLE_ONLINE_STORES = {"postgres": (None, PostgreSQLDataSourceCreator)} ``` -make test +{% endcode %} + + +If you are planning to start the online store up locally(e.g spin up a local Redis Instance) for testing, then the dictionary entry should be something like: + + +```python +{ + "sqlite": ({"type": "sqlite"}, None), + # Specifies sqlite as the online store. The `None` object specifies to not use a containerized docker container. +} ``` -The universal tests, which are integration tests specifically intended to test offline and online stores, can be run with: +If you are planning instead to use a Dockerized container to run your tests against your online store, you can define a `OnlineStoreCreator` and replace the `None` object above with your `OnlineStoreCreator` class. + + +If you create a containerized docker image for testing, developers who are trying to test with your online store will not have to spin up their own instance of the online store for testing. An example of an `OnlineStoreCreator` is shown below: + +{% code title="sdk/python/tests/integration/feature_repos/universal/online_store/redis.py" %} +```python +class RedisOnlineStoreCreator(OnlineStoreCreator): + def __init__(self, project_name: str, **kwargs): + super().__init__(project_name) + + def create_online_store(self) -> Dict[str, str]: + self.container.start() + log_string_to_wait_for = "Ready to accept connections" + wait_for_logs( + container=self.container, predicate=log_string_to_wait_for, timeout=10 + ) + self.container.stop() ``` +{% endcode %} + +3\. You should swap out the `FULL_REPO_CONFIGS` environment variable and run the integration tests against your online store. 
In the example repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_online_store/feast_tests.py`, so you would run: + +```bash +export FULL_REPO_CONFIGS_MODULE='feast_custom_online_store.feast_tests' make test-python-universal ``` -The unit tests should succeed, but the universal tests will likely fail. The tests are parametrized based on the `FULL_REPO_CONFIGS` variable defined in `sdk/python/tests/integration/feature_repos/repo_configuration.py`. To overwrite these configurations, you can simply create your own file that contains a `FULL_REPO_CONFIGS`, and point Feast to that file by setting the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. In this repo, the file that overwrites `FULL_REPO_CONFIGS` is `feast_custom_online_store/feast_tests.py`, so you would run +- If there are some tests that fail, this indicates that there is a mistake in the implementation of this online store! + + +### 5. Add Dependencies + +Add any dependencies for your online store to our `sdk/python/setup.py` under a new `_REQUIRED` list with the packages and add it to the setup script so that if your online store is needed, users can install the necessary python packages. These packages should be defined as extras so that they are not installed by users by default. +- You will need to regenerate our requirements files. To do this, create separate pyenv environments for python 3.8, 3.9, and 3.10. In each environment, run the following commands: ``` -export FULL_REPO_CONFIGS_MODULE='feast_custom_online_store.feast_tests' -make test-python-universal +export PYTHON= +make lock-python-ci-dependencies +``` + + +### 6. Add Documentation + +Remember to add the documentation for your online store. +1. Add a new markdown file to `docs/reference/online-stores/`. +2. You should also add a reference in `docs/reference/online-stores/README.md` and `docs/SUMMARY.md`. 
Add a new markdown document to document your online store functionality similar to how the other online stores are documented. + +**NOTE**: Be sure to document the following things about your online store: +- Be sure to cover how to create the datasource and what configuration is needed in the `feature_store.yaml` file in order to create the datasource. +- Make sure to flag that the online store is in alpha development. +- Add some documentation on what the data model is for the specific online store for more clarity. +- Finally, generate the python code docs by running: + +```bash +make build-sphinx ``` -to test the MySQL online store against the Feast universal tests. You should notice that some of the tests actually fail; this indicates that there is a mistake in the implementation of this online store! diff --git a/docs/how-to-guides/automated-feast-upgrade.md b/docs/how-to-guides/automated-feast-upgrade.md new file mode 100644 index 0000000000..ff17748537 --- /dev/null +++ b/docs/how-to-guides/automated-feast-upgrade.md @@ -0,0 +1,78 @@ +# Automated upgrades for Feast 0.20+ + +## Overview + +Starting with Feast 0.20, the APIs of many core objects (e.g. feature views and entities) have been changed. +For example, many parameters have been renamed. +These changes were made in a backwards-compatible fashion; existing Feast repositories will continue to work until Feast 0.23, without any changes required. +However, Feast 0.24 will fully deprecate all of the old parameters, so in order to use Feast 0.24+ users must modify their Feast repositories. + +There are currently deprecation warnings that indicate to users exactly how to modify their repos. +In order to make the process somewhat easier, Feast 0.23 also introduces a new CLI command, `repo-upgrade`, that will partially automate the process of upgrading Feast repositories. + +The upgrade command aims to automatically modify the object definitions in a feature repo to match the API required by Feast 0.24+. 
When running the command, the Feast CLI analyzes the source code in the feature repo files using [bowler](https://pybowler.io/), and attempts to rewrite the files in a best-effort way. It's possible for there to be parts of the API that are not upgraded automatically. + +The `repo-upgrade` command is specifically meant for upgrading Feast repositories that were initially created in versions 0.23 and below to be compatible with versions 0.24 and above. +It is not intended to work for any future upgrades. + +## Usage + +At the root of a feature repo, you can run `feast repo-upgrade`. By default, the CLI only echoes the changes it's planning on making, and does not modify any files in place. If the changes look reasonable, you can specify the `--write` flag to have the changes be written out to disk. + +An example: +```bash +$ feast repo-upgrade +--- /Users/achal/feast/prompt_dory/example.py ++++ /Users/achal/feast/prompt_dory/example.py +@@ -13,7 +13,6 @@ + path="/Users/achal/feast/prompt_dory/data/driver_stats.parquet", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", +- date_partition_column="created" + ) + + # Define an entity for the driver. You can think of entity as a primary key used to +--- /Users/achal/feast/prompt_dory/example.py ++++ /Users/achal/feast/prompt_dory/example.py +@@ -3,7 +3,7 @@ + from google.protobuf.duration_pb2 import Duration + import pandas as pd + +-from feast import Entity, Feature, FeatureView, FileSource, ValueType, FeatureService, OnDemandFeatureView ++from feast import Entity, FeatureView, FileSource, ValueType, FeatureService, OnDemandFeatureView + + # Read data from parquet files. Parquet is convenient for local development mode. For + # production, you can use your favorite DWH, such as BigQuery. 
See Feast documentation +--- /Users/achal/feast/prompt_dory/example.py ++++ /Users/achal/feast/prompt_dory/example.py +@@ -4,6 +4,7 @@ + import pandas as pd + + from feast import Entity, Feature, FeatureView, FileSource, ValueType, FeatureService, OnDemandFeatureView ++from feast import Field + + # Read data from parquet files. Parquet is convenient for local development mode. For + # production, you can use your favorite DWH, such as BigQuery. See Feast documentation +--- /Users/achal/feast/prompt_dory/example.py ++++ /Users/achal/feast/prompt_dory/example.py +@@ -28,9 +29,9 @@ + entities=["driver_id"], + ttl=Duration(seconds=86400 * 365), + features=[ +- Feature(name="conv_rate", dtype=ValueType.FLOAT), +- Feature(name="acc_rate", dtype=ValueType.FLOAT), +- Feature(name="avg_daily_trips", dtype=ValueType.INT64), ++ Field(name="conv_rate", dtype=ValueType.FLOAT), ++ Field(name="acc_rate", dtype=ValueType.FLOAT), ++ Field(name="avg_daily_trips", dtype=ValueType.INT64), + ], + online=True, + batch_source=driver_hourly_stats, +``` +--- +To write these changes out, you can run the same command with the `--write` flag: +```bash +$ feast repo-upgrade --write +``` + +You should see the same output, but also see the changes reflected in your feature repo on disk. \ No newline at end of file diff --git a/docs/how-to-guides/creating-a-custom-materialization-engine.md b/docs/how-to-guides/creating-a-custom-materialization-engine.md new file mode 100644 index 0000000000..935ac3dc99 --- /dev/null +++ b/docs/how-to-guides/creating-a-custom-materialization-engine.md @@ -0,0 +1,125 @@ +# Adding a custom materialization engine + +### Overview + +Feast batch materialization operations (`materialize` and `materialize-incremental`) execute through a `BatchMaterializationEngine`. + +Custom batch materialization engines allow Feast users to extend Feast to customize the materialization process. 
Examples include: + +* Setting up custom materialization-specific infrastructure during `feast apply` (e.g. setting up Spark clusters or Lambda Functions) +* Launching custom batch ingestion \(materialization\) jobs \(Spark, Beam, AWS Lambda\) +* Tearing down custom materialization-specific infrastructure during `feast teardown` (e.g. tearing down Spark clusters, or deleting Lambda Functions) + +Feast comes with built-in materialization engines, e.g., `LocalMaterializationEngine`, and an experimental `LambdaMaterializationEngine`. However, users can develop their own materialization engines by creating a class that implements the contract in the [BatchMaterializationEngine class](https://github.com/feast-dev/feast/blob/6d7b38a39024b7301c499c20cf4e7aef6137c47c/sdk/python/feast/infra/materialization/batch_materialization_engine.py#L72). + +### Guide + +The fastest way to add custom logic to Feast is to extend an existing materialization engine. The most generic engine is the `LocalMaterializationEngine` which contains no cloud-specific logic. The guide that follows will extend the `LocalMaterializationEngine` with operations that print text to the console. It is up to you as a developer to add your custom code to the engine methods, but the guide below will provide the necessary scaffolding to get you started. + +#### Step 1: Define an Engine class + +The first step is to define a custom materialization engine class. We've created the `MyCustomEngine` below. 
+ +```python +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union + +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.batch_feature_view import BatchFeatureView +from feast.stream_feature_view import StreamFeatureView +from feast.infra.materialization import LocalMaterializationEngine, LocalMaterializationJob, MaterializationTask +from feast.infra.offline_stores.offline_store import OfflineStore +from feast.infra.online_stores.online_store import OnlineStore +from feast.repo_config import RepoConfig + + +class MyCustomEngine(LocalMaterializationEngine): + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + super().__init__( + repo_config=repo_config, + offline_store=offline_store, + online_store=online_store, + **kwargs, + ) + + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + print("Creating new infrastructure is easy here!") + pass + + def materialize( + self, registry, tasks: List[MaterializationTask] + ) -> List[LocalMaterializationJob]: + print("Launching custom batch jobs or multithreading things is pretty easy...") + return [ + self._materialize_one( + registry, + task.feature_view, + task.start_time, + task.end_time, + task.project, + task.tqdm_builder, + ) + for task in tasks + ] + +``` + +Notice how in the above engine we have only overwritten two of the methods on the `LocalMaterializatinEngine`, namely `update` and `materialize`. These two methods are convenient to replace if you are planning to launch custom batch jobs. 
+ +#### Step 2: Configuring Feast to use the engine + +Configure your [feature\_store.yaml](../reference/feature-repository/feature-store-yaml.md) file to point to your new engine class: + +```yaml +project: repo +registry: registry.db +batch_engine: feast_custom_engine.MyCustomEngine +online_store: + type: sqlite + path: online_store.db +offline_store: + type: file +``` + +Notice how the `batch_engine` field above points to the module and class where your engine can be found. + +#### Step 3: Using the engine + +Now you should be able to use your engine by running a Feast command: + +```bash +feast apply +``` + +```text +Registered entity driver_id +Registered feature view driver_hourly_stats +Deploying infrastructure for driver_hourly_stats +Creating new infrastructure is easy here! +``` + +It may also be necessary to add the module root path to your `PYTHONPATH` as follows: + +```bash +PYTHONPATH=$PYTHONPATH:/home/my_user/my_custom_engine feast apply +``` + +That's it. You should now have a fully functional custom engine! diff --git a/docs/how-to-guides/fetching-java-features-k8s.md b/docs/how-to-guides/fetching-java-features-k8s.md deleted file mode 100644 index 1aa6abd52b..0000000000 --- a/docs/how-to-guides/fetching-java-features-k8s.md +++ /dev/null @@ -1,15 +0,0 @@ -# How to set up a Java feature server - -This tutorial guides you on how to: - -* Define features and data sources in Feast using the Feast CLI -* Materialize features to a Redis cluster deployed on Kubernetes. -* Deploy a Feast Java feature server into a Kubernetes cluster using the Feast helm charts -* Retrieve features using the gRPC API exposed by the Feast Java server - -Try it and let us know what you think! 
- -| ![](../.gitbook/assets/github-mark-32px.png)[ View guide in Github](../../examples/java-demo/README.md) | -|:--------------------------------------------------------------------------------------------------------| - - diff --git a/docs/how-to-guides/running-feast-in-production.md b/docs/how-to-guides/running-feast-in-production.md index 6023c5ac66..f03629ea4b 100644 --- a/docs/how-to-guides/running-feast-in-production.md +++ b/docs/how-to-guides/running-feast-in-production.md @@ -242,14 +242,12 @@ This service will provide an HTTP API with JSON I/O, which can be easily used wi [Read more about this feature](../reference/alpha-aws-lambda-feature-server.md) -### 4.3. Java based Feature Server deployed on Kubernetes +### 4.3. Go feature server deployed on Kubernetes -For users with very latency-sensitive and high QPS use-cases, Feast offers a high-performance Java feature server. -Besides the benefits of running on JVM, this implementation also provides a gRPC API, which guarantees good connection utilization and -small request / response body size (compared to JSON). -You will need the Feast Java SDK to retrieve features from this service. This SDK wraps all the gRPC logic for you and provides more convenient APIs. +For users with very latency-sensitive and high QPS use-cases, Feast offers a high-performance [Go feature server](../reference/feature-servers/go-feature-server.md). +It can use either HTTP or gRPC. -The Java based feature server can be deployed to Kubernetes cluster via Helm charts in a few simple steps: +The Go feature server can be deployed to a Kubernetes cluster via Helm charts in a few simple steps: 1. Install [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) and [helm 3](https://helm.sh/) 2. Add the Feast Helm repository and download the latest charts: @@ -259,18 +257,15 @@ helm repo update ``` 3. 
Run Helm Install ``` -helm install feast-release feast-charts/feast \ +helm install feast-release feast-charts/feast-feature-server \ --set global.registry.path=s3://feast/registries/prod \ --set global.project= ``` -This chart will deploy two services: `feature-server` and `transformation-service`. -Both must have read access to the registry file on cloud storage. Both will keep a copy of the registry in their memory and periodically refresh it, so expect some delays in update propagation in exchange for better performance. - -#### Load balancing - -The next step would be to install an L7 Load Balancer (eg, [Envoy](https://www.envoyproxy.io/)) in front of the Java feature server. -For seamless integration with Kubernetes (including services created by Feast Helm chart) we recommend using [Istio](https://istio.io/) as Envoy's orchestrator. +This chart will deploy a single service. +The service must have read access to the registry file on cloud storage. +It will keep a copy of the registry in its memory and periodically refresh it, so expect some delays in update propagation in exchange for better performance. +In order for the Go feature server to be enabled, you should set `go_feature_serving: True` in the `feature_store.yaml`. ## 5. Ingesting features from a stream source @@ -344,8 +339,8 @@ Summarizing it all together we want to show several options of architecture that * Feast SDK is being triggered by CI (eg, Github Actions). 
It applies the latest changes from the feature repo to the Feast registry * Airflow manages materialization jobs to ingest data from DWH to the online store periodically * For the stream ingestion Feast Python SDK is used in the existing Spark / Beam pipeline -* Online features are served via either a Python feature server or a high performance Java feature server - * Both the Java feature server and the transformation server are deployed on Kubernetes cluster (via Helm charts) +* Online features are served via either a Python feature server or a high performance Go feature server + * The Go feature server can be deployed on a Kubernetes cluster (via Helm charts) * Feast Python SDK is called locally to generate a training dataset ![From Repository to Production: Feast Production Architecture](production-spark.png) diff --git a/docs/project/development-guide.md b/docs/project/development-guide.md index 6d5bee16af..58e29a5ca7 100644 --- a/docs/project/development-guide.md +++ b/docs/project/development-guide.md @@ -11,85 +11,6 @@ This guide is targeted at developers looking to contribute to Feast: > Learn How the Feast [Contributing Process](contributing.md) works. -## Project Structure - -Feast is composed of [multiple components](../getting-started/architecture-and-components/) distributed into multiple repositories: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RepositoryDescriptionComponent(s)
Main Feast Repository - Hosts all required code to run Feast. This includes the Feast Python SDK - and Protobuf definitions. For legacy reasons this repository still contains - Terraform config and a Go Client for Feast. -
    -
  • Python SDK / CLI -
  • -
  • Protobuf APIs -
  • -
  • Documentation -
  • -
  • Go Client -
  • -
  • Terraform -
  • -
-
Feast Java - Java-specific Feast components. Includes the Feast Core Registry, Feast - Serving for serving online feature values, and the Feast Java Client for - retrieving feature values. -
    -
  • Core -
  • -
  • Serving -
  • -
  • Java Client -
  • -
-
Feast Spark - Feast Spark SDK & Feast Job Service for launching ingestion jobs and - for building training datasets with Spark -
    -
  • Spark SDK -
  • -
  • Job Service -
  • -
-
Feast Helm Chart - Helm Chart for deploying Feast on Kubernetes & Spark. -
    -
  • Helm Chart -
  • -
-
- ## Making a Pull Request {% hint style="info" %} @@ -148,5 +69,3 @@ The language specific bindings have to be regenerated when changes are made to t | :--- | :--- | :--- | | [Main Feast Repository](https://github.com/feast-dev/feast) | Python | Run `make compile-protos-python` to generate bindings | | [Main Feast Repository](https://github.com/feast-dev/feast) | Golang | Run `make compile-protos-go` to generate bindings | -| [Feast Java](https://github.com/feast-dev/feast-java) | Java | No action required: bindings are generated automatically during compilation. | - diff --git a/docs/project/new_branch_part_1.png b/docs/project/new_branch_part_1.png new file mode 100644 index 0000000000..e8e59d8214 Binary files /dev/null and b/docs/project/new_branch_part_1.png differ diff --git a/docs/project/new_branch_part_2.png b/docs/project/new_branch_part_2.png new file mode 100644 index 0000000000..f94c2e3227 Binary files /dev/null and b/docs/project/new_branch_part_2.png differ diff --git a/docs/project/new_branch_part_3.png b/docs/project/new_branch_part_3.png new file mode 100644 index 0000000000..34cbb80751 Binary files /dev/null and b/docs/project/new_branch_part_3.png differ diff --git a/docs/project/new_branch_part_4.png b/docs/project/new_branch_part_4.png new file mode 100644 index 0000000000..c9c3cc4352 Binary files /dev/null and b/docs/project/new_branch_part_4.png differ diff --git a/docs/project/new_branch_part_5.png b/docs/project/new_branch_part_5.png new file mode 100644 index 0000000000..89b3a08cc6 Binary files /dev/null and b/docs/project/new_branch_part_5.png differ diff --git a/docs/project/release-process.md b/docs/project/release-process.md index af573c92c7..e9f3295d91 100644 --- a/docs/project/release-process.md +++ b/docs/project/release-process.md @@ -4,60 +4,54 @@ For Feast maintainers, these are the concrete steps for making a new release. -1. For new major or minor release, create and check out the release branch for the new stream, e.g. 
`v0.6-branch`. For a patch version, check out the stream's release branch. -2. Update the [CHANGELOG.md](../../CHANGELOG.md). See the [Creating a change log](release-process.md#creating-a-change-log) guide and commit - * Make to review each PR in the changelog to [flag any breaking changes and deprecation.](release-process.md#flag-breaking-changes-and-deprecations) -3. Update versions for the release/release candidate with a commit: - 1. In the root `pom.xml`, remove `-SNAPSHOT` from the `` property, update versions, and commit. - 2. Tag the commit with the release version, using a `v` and `sdk/go/v` prefixes - * for a release candidate, create tags `vX.Y.Z-rc.N`and `sdk/go/vX.Y.Z-rc.N` - * for a stable release `X.Y.Z` create tags `vX.Y.Z` and `sdk/go/vX.Y.Z` - 3. Check that versions are updated with `make lint-versions`. - 4. If changes required are flagged by the version lint, make the changes, amend the commit and move the tag to the new commit. -4. Push the commits and tags. Make sure the CI passes. - * If the CI does not pass, or if there are new patches for the release fix, repeat step 2 & 3 with release candidates until stable release is achieved. -5. Bump to the next patch version in the release branch, append `-SNAPSHOT` in `pom.xml` and push. -6. Create a PR against master to: - 1. Bump to the next major/minor version and append `-SNAPSHOT` . - 2. Add the change log by applying the change log commit created in step 2. - 3. Check that versions are updated with `env TARGET_MERGE_BRANCH=master make lint-versions` -7. Create a [GitHub release](https://github.com/feast-dev/feast/releases) which includes a summary of im~~p~~ortant changes as well as any artifacts associated with the release. Make sure to include the same change log as added in [CHANGELOG.md](../../CHANGELOG.md). Use `Feast vX.Y.Z` as the title. 
- -When a tag that matches a Semantic Version string is pushed, CI will automatically build and push the relevant artifacts to their repositories or package managers (docker images, Python wheels, etc). JVM artifacts are promoted from Sonatype OSSRH to Maven Central, but it sometimes takes some time for them to be available. The `sdk/go/v tag` is required to version the Go SDK go module so that users can go get a specific tagged release of the Go SDK. - -### Creating a change log - -We use an [open source change log generator](https://hub.docker.com/r/ferrarimarco/github-changelog-generator/) to generate change logs. The process still requires a little bit of manual effort. - -1. Create a GitHub token as [per these instructions](https://github.com/github-changelog-generator/github-changelog-generator#github-token). The token is used as an input argument (`-t`) to the change log generator. -2. The change log generator configuration below will look for unreleased changes on a specific branch. The branch will be `master` for a major/minor release, or a release branch (`v0.4-branch`) for a patch release. You will need to set the branch using the `--release-branch` argument. -3. You should also set the `--future-release` argument. This is the version you are releasing. The version can still be changed at a later date. -4. Update the arguments below and run the command to generate the change log to the console. - -``` -docker run -it --rm ferrarimarco/github-changelog-generator \ ---user feast-dev \ ---project feast \ ---release-branch \ ---future-release \ ---unreleased-only \ ---no-issues \ ---bug-labels kind/bug \ ---enhancement-labels kind/feature \ ---breaking-labels compat/breaking \ --t \ ---max-issues 1 \ --o -``` - -1. Review each change log item. - * Make sure that sentences are grammatically correct and well formatted (although we will try to enforce this at the PR review stage). - * Make sure that each item is categorised correctly. 
You will see the following categories: `Breaking changes`, `Implemented enhancements`, `Fixed bugs`, and `Merged pull requests`. Any unlabelled PRs will be found in `Merged pull requests`. It's important to make sure that any `breaking changes`, `enhancements`, or `bug fixes` are pulled up out of `merged pull requests` into the correct category. Housekeeping, tech debt clearing, infra changes, or refactoring do not count as `enhancements`. Only enhancements a user benefits from should be listed in that category. - * Make sure that the "Full Change log" link is actually comparing the correct tags (normally your released version against the previously version). - * Make sure that release notes and breaking changes are present. - -### Flag Breaking Changes & Deprecations - -It's important to flag breaking changes and deprecation to the API for each release so that we can maintain API compatibility. - -Developers should have flagged PRs with breaking changes with the `compat/breaking` label. However, it's important to double check each PR's release notes and contents for changes that will break API compatibility and manually label `compat/breaking` to PRs with undeclared breaking changes. The change log will have to be regenerated if any new labels have to be added. +### Pre-release Verification (Verification that wheels are built correctly) for minor release. +1. Merge upstream master changes into your **fork**. Make sure you are running the workflow off of your fork! +2. Create a tag manually for the release on your fork. For example, if you are doing a release for version 0.22.0, create a tag by doing the following. + - Checkout master branch and run `git tag v0.22.0`. + - Run `git push --tags` to push the tag to your forks master branch. +3. Access the `Actions` tab on your github UI on your fork and click the `build_wheels` action. This workflow will build the python sdk wheels for Python 3.8-3.10 on MacOS 10.15 and Linux and verify that these wheels are correct. 
The publish workflow uses this action to publish the python wheels for a new release to pypi. +4. Look for the header `This workflow has a workflow_dispatch event trigger` and click `Run Workflow` on the right. +5. Run the workflow off of the tag you just created (`v0.22.0` in this case) and verify that the workflow worked (i.e. ensure that all jobs are green). + +### Pre-release Verification (Verification that wheels are built correctly) for patch release. +1. Check out the branch of your release (e.g. `v0.22-branch` on your local **fork**) and push this to your fork (`git push -u origin `). +2. Cherry pick commits that are relevant to the patch release onto your forked branch. +3. Checkout the release branch and add a patch release tag (e.g. `v0.22.1`) by running `git tag `. +4. Push tags to your origin branch with `git push origin `. +5. Kick off `build_wheels` workflow in the same way as is detailed in the last section off of the patch release tag. + +### Release for Python and Java SDK +1. Generate a [Personal Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) or retrieve your saved personal access token. + - The personal access token should have all of the permissions under the `repo` checkbox. +2. Access the `Actions` tab on the main `feast-dev/feast` repo and find the `release` action. +3. Look for the header `This workflow has a workflow_dispatch event trigger` again and click `Run Workflow` on the right. +4. Try the dry run first with your personal access token. If this succeeds, uncheck `Dry Run` and run the release workflow. +5. All of the jobs should succeed besides the UI job which needs to be released separately. Ping a maintainer on Slack to run the UI release manually. +6. Try to install the feast release in your local environment and test out the `feast init` -> `feast apply` workflow to verify as a sanity check that the release worked correctly. 
+ +### (for minor releases) Post-release steps +1. Create a new branch based on master (i.e. v0.22-branch) and push to the main Feast repo. This will be where cherry-picks go for future patch releases and where documentation will point. +2. Write a summary of the release in the GitHub release + 1. By default, Semantic Release will pull in messages from commits (features vs fixes, etc). But this is hard to digest still, so it helps to have a high level overview. + +### Update documentation + +In the Feast Gitbook (ask [Danny Chiao](https://tectonfeast.slack.com/team/U029405HFEU) in Slack for access): +1. Create a new space within the Feast collection +2. Go to the overflow menu on the top -> Synchronize with Git + 1. Specify GitHub as the provider + + ![](new_branch_part_1.png) + 2. Configure to point to the new release branch + + ![](new_branch_part_2.png) +3. Publish the new page for this branch as part of the collection + + ![](new_branch_part_3.png) +4. Go back to the main Feast collection and go to the overflow menu -> "Customize collection" + + ![](new_branch_part_4.png) +5. Configure the default space to be your new branch and save + + ![](new_branch_part_5.png) +6. Verify on docs.feast.dev that this new space is the default (this may take a few minutes to propagate, and your browser cache may be caching the old branch as the default) \ No newline at end of file diff --git a/docs/project/versioning-policy.md b/docs/project/versioning-policy.md index 8e51676355..b1ff2c75e7 100644 --- a/docs/project/versioning-policy.md +++ b/docs/project/versioning-policy.md @@ -23,24 +23,18 @@ In general, unless you're committing code that only applies to a particular rele The following table shows the **status** \(stable, beta, or alpha\) of Feast components. 
-Application status indicators for Feast: +Component status indicators for Feast: * **Stable** means that the component has reached a sufficient level of stability and adoption that the Feast community has deemed the component stable. Please see the stability criteria below. * **Beta** means that the component is working towards a version 1.0 release. Beta does not mean a component is unstable, it simply means the component has not met the full criteria of stability. * **Alpha** means that the component is in the early phases of development and/or integration into Feast. -| Application | Status | Notes | -| :--- | :--- | :--- | -| [Feast Serving](https://github.com/feast-dev/feast-java) | Beta | APIs are considered stable and will not have breaking changes within 3 minor versions. | -| [Feast Core](https://github.com/feast-dev/feast-java) | Beta | At risk of deprecation | -| [Feast Java Client](https://github.com/feast-dev/feast-java) | Beta | | -| [Feast Python SDK](https://github.com/feast-dev/feast) | Beta | | -| [Feast Go Client](https://github.com/feast-dev/feast) | Beta | | -| [Feast Spark Python SDK](https://github.com/feast-dev/feast-spark) | Alpha | | -| [Feast Spark Launchers](https://github.com/feast-dev/feast-spark) | Alpha | | -| [Feast Job Service](https://github.com/feast-dev/feast-spark) | Alpha | Scheduled for deprecation | -| [Feast Helm Chart](https://github.com/feast-dev/feast-helm-charts) | Beta | | -| | | | +| Component | Status | Notes | +|:---------------------------------------------------------------------------------|:-------| :--- | +| [Feast Python SDK](https://github.com/feast-dev/feast/tree/master/sdk/python) | Stable | | +| [Feast Go Feature Server](https://github.com/feast-dev/feast/tree/master/) | Beta | | +| [Feast Java Feature Server](https://github.com/feast-dev/feast/tree/master/java) | Alpha | | +| | | | Criteria for reaching _**stable**_ status: diff --git a/docs/reference/feature-repository/feature-store-yaml.md 
b/docs/reference/feature-repository/feature-store-yaml.md index fa10149cfe..a87e09ba43 100644 --- a/docs/reference/feature-repository/feature-store-yaml.md +++ b/docs/reference/feature-repository/feature-store-yaml.md @@ -24,5 +24,6 @@ The following top-level configuration options exist in the `feature_store.yaml` * **online_store** — Configures the online store. * **offline_store** — Configures the offline store. * **project** — Defines a namespace for the entire feature store. Can be used to isolate multiple deployments in a single installation of Feast. Should only contain letters, numbers, and underscores. +* **engine** - Configures the batch materialization engine. Please see the [RepoConfig](https://rtd.feast.dev/en/latest/#feast.repo_config.RepoConfig) API reference for the full list of configuration options. diff --git a/docs/reference/feature-servers/go-feature-retrieval.md b/docs/reference/feature-servers/go-feature-retrieval.md deleted file mode 100644 index 685e7201cb..0000000000 --- a/docs/reference/feature-servers/go-feature-retrieval.md +++ /dev/null @@ -1,73 +0,0 @@ -# Go-based Feature Retrieval - -## Overview - -The Go Feature Retrieval component is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](python-feature-server.md). - -Currently, this component only supports online serving and does not have an offline component including APIs to create feast feature repositories or apply configuration to the registry to facilitate online materialization. It also does not expose its own dedicated cli to perform feast actions. Furthermore, this component is only meant to expose an online serving API that can be called through the python SDK to facilitate faster online feature retrieval. 
- -The Go Feature Retrieval component currently only supports Redis and Sqlite as online stores; support for other online stores will be added soon. Initial benchmarks indicate that it is significantly faster than the Python feature server for online feature retrieval. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). - -## Installation - -As long as you are running macOS or linux, on x86, with python version 3.7-3.10, the go component comes pre-compiled when you install feast. - -However, some additional dependencies are required for Go <-> Python interoperability. To install these dependencies run the following command in your console: -``` -pip install feast[go] -``` - -For developers, if you want to build from source, run `make compile-go-lib` to build and compile the go server. - -## Usage - -To enable the Go online feature retrieval component, set `go_feature_retrieval: True` in your `feature_store.yaml`. This will direct all online feature retrieval to Go instead of Python. This flag will be enabled by default in the future. - -{% code title="feature_store.yaml" %} -```yaml -project: my_feature_repo -registry: data/registry.db -provider: local -online_store: - type: redis - connection_string: "localhost:6379" -go_feature_retrieval: True -``` -{% endcode %} - -## Feature logging - -Go feature server can log all requested entities and served features to a configured destination inside an offline store. -This allows users to create new datasets from features served online. Those datasets could be used for future trainings or for -feature validations. 
To enable feature logging we need to edit `feature_store.yaml`: -```yaml -project: my_feature_repo -registry: data/registry.db -provider: local -online_store: - type: redis - connection_string: "localhost:6379" -go_feature_retrieval: True -feature_server: - feature_logging: - enable: True -``` - -Feature logging configuration in `feature_store.yaml` also allows to tweak some low-level parameters to achieve the best performance: -```yaml -feature_server: - feature_logging: - enable: True - flush_interval_secs: 300 - write_to_disk_interval_secs: 30 - emit_timeout_micro_secs: 10000 - queue_capacity: 10000 -``` -All these parameters are optional. - -## Future/Current Work - -The Go feature retrieval online feature logging for Data Quality Monitoring is currently in development. More information can be found [here](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit#heading=h.9gaqqtox9jg6). - -We also plan on adding support for the Java feature server (e.g. the capability to call into the Go component and execute Java UDFs). - diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md new file mode 100644 index 0000000000..f83b765c3a --- /dev/null +++ b/docs/reference/feature-servers/go-feature-server.md @@ -0,0 +1,93 @@ +# Go feature server + +## Overview + +The Go feature server is an HTTP/gRPC endpoint that serves features. +It is written in Go, and is therefore significantly faster than the Python feature server. +See this [blog post](https://feast.dev/blog/go-feature-server-benchmarks/) for more details on the comparison between Python and Go. +In general, we recommend the Go feature server for all production use cases that require extremely low-latency feature serving. +Currently only the Redis and SQLite online stores are supported. + +## CLI + +By default, the Go feature server is turned off. 
+To turn it on you can add `go_feature_serving: True` to your `feature_store.yaml`: + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: redis + connection_string: "localhost:6379" +go_feature_serving: True +``` +{% endcode %} + +Then the `feast serve` CLI command will start the Go feature server. +As with Python, the Go feature server uses port 6566 by default; the port can be overridden with a `--port` flag. +Moreover, the server uses HTTP by default, but can be set to use gRPC with `--type=grpc`. + +Alternatively, if you wish to experiment with the Go feature server instead of permanently turning it on, you can just run `feast serve --go`. + +## Installation + +The Go component comes pre-compiled when you install Feast with Python versions 3.8-3.10 on macOS or Linux (on x86). +In order to install the additional Python dependencies, you should install Feast with +``` +pip install feast[go] +``` +You must also install the Apache Arrow C++ libraries. +This is because the Go feature server uses the cgo memory allocator from the Apache Arrow C++ library for interoperability between Go and Python, to prevent memory from being accidentally garbage collected when executing on-demand feature views. +You can read more about the usage of the cgo memory allocator in these [docs](https://pkg.go.dev/github.com/apache/arrow/go/arrow@v0.0.0-20211112161151-bc219186db40/cdata#ExportArrowRecordBatch). + +For macOS, run `brew install apache-arrow`. +For Linux users, you have to install `libarrow-dev`.
+``` +sudo apt update +sudo apt install -y -V ca-certificates lsb-release wget +wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +sudo apt update +sudo apt install -y -V libarrow-dev # For C++ +``` +For developers, if you want to build from source, run `make compile-go-lib` to build and compile the go server. In order to build the go binaries, you will need to install the `apache-arrow` c++ libraries. + +## Alpha features + +### Feature logging + +The Go feature server can log all requested entities and served features to a configured destination inside an offline store. +This allows users to create new datasets from features served online. Those datasets could be used for future trainings or for +feature validations. To enable feature logging we need to edit `feature_store.yaml`: +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: redis + connection_string: "localhost:6379" +go_feature_serving: True +feature_server: + feature_logging: + enable: True +``` + +Feature logging configuration in `feature_store.yaml` also allows to tweak some low-level parameters to achieve the best performance: +```yaml +feature_server: + feature_logging: + enable: True + flush_interval_secs: 300 + write_to_disk_interval_secs: 30 + emit_timeout_micro_secs: 10000 + queue_capacity: 10000 +``` +All these parameters are optional. + +### Python SDK retrieval + +The logic for the Go feature server can also be used to retrieve features during a Python `get_online_features` call. +To enable this behavior, you must add `go_feature_retrieval: True` to your `feature_store.yaml`. +You must also have all the dependencies installed as detailed above. 
diff --git a/docs/reference/feature-servers/python-feature-server.md b/docs/reference/feature-servers/python-feature-server.md index ecc12dd12d..2646c28ef4 100644 --- a/docs/reference/feature-servers/python-feature-server.md +++ b/docs/reference/feature-servers/python-feature-server.md @@ -2,23 +2,22 @@ ## Overview -The feature server is an HTTP endpoint that serves features with JSON I/O. This enables users to write + read features from Feast online stores using any programming language that can make HTTP requests. +The Python feature server is an HTTP endpoint that serves features with JSON I/O. This enables users to write and read features from the online store using any programming language that can make HTTP requests. ## CLI -There is a CLI command that starts the server: `feast serve`. By default, Feast uses port 6566; the port be overridden by a `--port` flag. +There is a CLI command that starts the server: `feast serve`. By default, Feast uses port 6566; the port can be overridden with a `--port` flag. ## Deploying as a service -One can also deploy a feature server by building a docker image that bundles in the project's `feature_store.yaml`. See [helm chart](https://github.com/feast-dev/feast/blob/master/infra/charts/feast-python-server) for example. - -A [remote feature server](../alpha-aws-lambda-feature-server.md) on AWS Lambda is available. A remote feature server on GCP Cloud Run is currently being developed. +One can deploy a feature server by building a docker image that bundles in the project's `feature_store.yaml`. See this [helm chart](https://github.com/feast-dev/feast/blob/master/infra/charts/feast-python-server) for an example. +A [remote feature server](../alpha-aws-lambda-feature-server.md) on AWS Lambda is also available.
## Example ### Initializing a feature server -Here's the local feature server usage example with the local template: +Here's an example of how to start the Python feature server with a local feature repo: ```bash $ feast init feature_repo @@ -27,9 +26,11 @@ Creating a new Feast repository in /home/tsotne/feast/feature_repo. $ cd feature_repo $ feast apply -Registered entity driver_id -Registered feature view driver_hourly_stats -Deploying infrastructure for driver_hourly_stats +Created entity driver +Created feature view driver_hourly_stats +Created feature service driver_activity + +Created sqlite table feature_repo_driver_hourly_stats $ feast materialize-incremental $(date +%Y-%m-%d) Materializing 1 feature views to 2021-09-09 17:00:00-07:00 into the sqlite online store. @@ -38,8 +39,6 @@ driver_hourly_stats from 2021-09-09 16:51:08-07:00 to 2021-09-09 17:00:00-07:00: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 295.24it/s] $ feast serve -This is an experimental feature. It's intended for early testing and feedback, and could change without warnings in future releases. -INFO: Started server process [8889] 09/10/2021 10:42:11 AM INFO:Started server process [8889] INFO: Waiting for application startup. 09/10/2021 10:42:11 AM INFO:Waiting for application startup. 
@@ -49,7 +48,7 @@ INFO: Uvicorn running on http://127.0.0.1:6566 (Press CTRL+C to quit) 09/10/2021 10:42:11 AM INFO:Uvicorn running on http://127.0.0.1:6566 (Press CTRL+C to quit) ``` -### Retrieving features from the online store +### Retrieving features After the server starts, we can execute cURL commands from another terminal tab: ```bash @@ -153,11 +152,9 @@ curl -X POST \ ``` ### Pushing features to the online and offline stores -You can push data corresponding to a push source to the online and offline stores (note that timestamps need to be strings): - -You can also define a pushmode to push stream or batch data, either to the online store, offline store, or both. The feature server will throw an error if the online/offline store doesn't support the push api functionality. +The Python feature server also exposes an endpoint for [push sources](../../data-sources/push.md). This endpoint allows you to push data to the online and/or offline store. -The request definition for pushmode is a string parameter `to` where the options are: ["online", "offline", "online_and_offline"]. +The request definition for pushmode is a string parameter `to` where the options are: ["online", "offline", "online_and_offline"]. Note that timestamps need to be strings. ```text curl -X POST "http://localhost:6566/push" -d '{ "push_source_name": "driver_hourly_stats_push_source", diff --git a/docs/reference/offline-stores/snowflake.md b/docs/reference/offline-stores/snowflake.md index aa006b43bb..e2afaef90d 100644 --- a/docs/reference/offline-stores/snowflake.md +++ b/docs/reference/offline-stores/snowflake.md @@ -2,7 +2,7 @@ ## Description -The Snowflake offline store provides support for reading [SnowflakeSources](../data-sources/snowflake.md). +The [Snowflake](https://trial.snowflake.com) offline store provides support for reading [SnowflakeSources](../data-sources/snowflake.md). * Snowflake tables and views are allowed as sources. * All joins happen within Snowflake. 
@@ -11,7 +11,7 @@ The Snowflake offline store provides support for reading [SnowflakeSources](../d * This allows you to call * `to_snowflake` to save the dataset into Snowflake * `to_sql` to get the SQL query that would execute on `to_df` - * `to_arrow_chunks` to get the result in batches ([Snowflake python connector docs](https://docs.snowflake.com/en/user-guide/python-connector-api.html#get_result_batches)) + * `to_arrow_chunks` to get the result in batches ([Snowflake python connector docs](https://docs.snowflake.com/en/user-guide/python-connector-api.html#get_result_batches)) ## Example diff --git a/docs/reference/online-stores/README.md b/docs/reference/online-stores/README.md index b3578b8539..5eb566af3c 100644 --- a/docs/reference/online-stores/README.md +++ b/docs/reference/online-stores/README.md @@ -4,6 +4,8 @@ Please see [Online Store](../../getting-started/architecture-and-components/onli {% page-ref page="sqlite.md" %} +{% page-ref page="snowflake.md" %} + {% page-ref page="redis.md" %} {% page-ref page="datastore.md" %} diff --git a/docs/reference/online-stores/snowflake.md b/docs/reference/online-stores/snowflake.md new file mode 100644 index 0000000000..ccf3d526da --- /dev/null +++ b/docs/reference/online-stores/snowflake.md @@ -0,0 +1,35 @@ +# Snowflake + +## Description + +The [Snowflake](https://trial.snowflake.com) online store provides support for materializing feature values into a Snowflake Transient Table for serving online features. 
+ +* Only the latest feature values are persisted + +The data model for using a Snowflake Transient Table as an online store follows a tall format (one row per feature): +* "entity_feature_key" (BINARY) -- unique key used when reading specific feature_view x entity combination +* "entity_key" (BINARY) -- repeated key currently unused for reading entity_combination +* "feature_name" (VARCHAR) +* "value" (BINARY) +* "event_ts" (TIMESTAMP) +* "created_ts" (TIMESTAMP) + + (This model may be subject to change when Snowflake Hybrid Tables are released) + +## Example + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: snowflake.online + account: SNOWFLAKE_DEPLOYMENT_URL + user: SNOWFLAKE_USER + password: SNOWFLAKE_PASSWORD + role: SNOWFLAKE_ROLE + warehouse: SNOWFLAKE_WAREHOUSE + database: SNOWFLAKE_DATABASE +``` +{% endcode %} diff --git a/docs/roadmap.md b/docs/roadmap.md index 19af4f95c9..e481453dff 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -1,10 +1,9 @@ # Roadmap -The list below contains the functionality that contributors are planning to develop for Feast +The list below contains the functionality that contributors are planning to develop for Feast. -* Items below that are in development (or planned for development) will be indicated in parentheses. * We welcome contribution to all items in the roadmap! -* Want to speak to a Feast contributor? We are more than happy to jump on a call. Please schedule a time using [Calendly](https://calendly.com/d/x2ry-g5bb/meet-with-feast-team). +* Have questions about the roadmap? Ask in the #feast-development Slack channel.
* **Data Sources** * [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) @@ -16,7 +15,6 @@ The list below contains the functionality that contributors are planning to deve * [x] [Postgres (contrib plugin)](https://docs.feast.dev/reference/data-sources/postgres) * [x] [Spark (contrib plugin)](https://docs.feast.dev/reference/data-sources/spark) * [x] Kafka / Kinesis sources (via [push support into the online store](https://docs.feast.dev/reference/data-sources/push)) - * [ ] HTTP source * **Offline Stores** * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) @@ -29,6 +27,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [In-memory / Pandas](https://docs.feast.dev/reference/offline-stores/file) * [x] [Custom offline store support](https://docs.feast.dev/how-to-guides/adding-a-new-offline-store) * **Online Stores** + * [x] [Snowflake](https://docs.feast.dev/reference/online-stores/snowflake) * [x] [DynamoDB](https://docs.feast.dev/reference/online-stores/dynamodb) * [x] [Redis](https://docs.feast.dev/reference/online-stores/redis) * [x] [Datastore](https://docs.feast.dev/reference/online-stores/datastore) @@ -49,26 +48,16 @@ The list below contains the functionality that contributors are planning to deve * **Deployments** * [x] AWS Lambda (Alpha release. See [RFC](https://docs.google.com/document/d/1eZWKWzfBif66LDN32IajpaG-j82LSHCCOzY6R7Ax7MI/edit)) * [x] Kubernetes (See [guide](https://docs.feast.dev/how-to-guides/running-feast-in-production#4.3.-java-based-feature-server-deployed-on-kubernetes)) - * [ ] Cloud Run - * [ ] KNative * **Feature Serving** * [x] Python Client - * [x] REST Feature Server (Python) (Alpha release. 
See [RFC](https://docs.google.com/document/d/1iXvFhAsJ5jgAhPOpTdB3j-Wj1S9x3Ev\_Wr6ZpnLzER4/edit)) - * [x] gRPC Feature Server (Java) (See [#1497](https://github.com/feast-dev/feast/issues/1497)) - * [x] Push API - * [ ] Java Client - * [ ] Go Client - * [ ] Delete API - * [] Feature Logging (for training) + * [x] [Python feature server](https://docs.feast.dev/reference/feature-servers/python-feature-server) + * [x] [Go feature server](https://docs.feast.dev/reference/feature-servers/go-feature-server) * **Data Quality Management (See [RFC](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit))** * [x] Data profiling and validation (Great Expectations) - * [ ] Training-serving skew detection (in progress) - * [ ] Metric production - * [ ] Drift detection * **Feature Discovery and Governance** * [x] Python SDK for browsing feature registry * [x] CLI for browsing feature registry * [x] Model-centric feature tracking (feature services) * [x] Amundsen integration (see [Feast extractor](https://github.com/amundsen-io/amundsen/blob/main/databuilder/databuilder/extractor/feast_extractor.py)) - * [x] Feast Web UI (Alpha release. See [documentation](https://docs.feast.dev/reference/alpha-web-ui)) - * [ ] REST API for browsing feature registry + * [x] DataHub integration (see [DataHub Feast docs](https://datahubproject.io/docs/generated/ingestion/sources/feast/)) + * [x] Feast Web UI (Alpha release. See [docs](https://docs.feast.dev/reference/alpha-web-ui)) diff --git a/docs/tutorials/driver-stats-on-snowflake.md b/docs/tutorials/driver-stats-on-snowflake.md index 01b158cb1a..306ae2f59b 100644 --- a/docs/tutorials/driver-stats-on-snowflake.md +++ b/docs/tutorials/driver-stats-on-snowflake.md @@ -1,6 +1,6 @@ --- description: >- - Initial demonstration of Snowflake as an offline store with Feast, using the Snowflake demo template. + Initial demonstration of Snowflake as an offline+online store with Feast, using the Snowflake demo template. 
--- # Drivers stats on Snowflake @@ -61,6 +61,14 @@ offline_store: role: ROLE_NAME #case sensitive warehouse: WAREHOUSE_NAME #case sensitive database: DATABASE_NAME #case cap sensitive +online_store: + type: snowflake.online + account: SNOWFLAKE_DEPLOYMENT_URL #drop .snowflakecomputing.com + user: USERNAME + password: PASSWORD + role: ROLE_NAME #case sensitive + warehouse: WAREHOUSE_NAME #case sensitive + database: DATABASE_NAME #case cap sensitive ``` {% endcode %} diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb index d29ee4fa35..c7d0fcfe54 100644 --- a/examples/quickstart/quickstart.ipynb +++ b/examples/quickstart/quickstart.ipynb @@ -949,45 +949,6 @@ "pprint(feature_vector)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "_dBcqkaCnOYv" - }, - "source": [ - "## Step 7: Explore registered features with the Web UI" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "id": "mCUPypyhl5TH", - "outputId": "fb2475c3-b254-42e6-b638-7982d52d2a19" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nohup: appending output to 'nohup.out'\n", - "Open the Web UI at https://c6cuffvc4qm-496ff2e9c6d22116-8888-colab.googleusercontent.com/\n" - ] - } - ], - "source": [ - "from google.colab.output import eval_js\n", - "host = eval_js(\"google.colab.kernel.proxyPort(8888)\")\n", - "\n", - "!nohup feast ui &\n", - "\n", - "print(f\"Open the Web UI at {host}\")" - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/go.mod b/go.mod index fbbc95e1bf..3c05383ffc 100644 --- a/go.mod +++ b/go.mod @@ -2,12 +2,12 @@ module github.com/feast-dev/feast go 1.17 -replace github.com/go-python/gopy v0.4.0 => github.com/feast-dev/gopy v0.4.1-0.20220429180328-4257ac71a4d0 +replace github.com/go-python/gopy v0.4.4 => github.com/feast-dev/gopy v0.4.1-0.20220714211711-252048177d85 require ( 
github.com/apache/arrow/go/v8 v8.0.0 github.com/ghodss/yaml v1.0.0 - github.com/go-python/gopy v0.4.0 + github.com/go-python/gopy v0.4.4 github.com/go-redis/redis/v8 v8.11.4 github.com/golang/protobuf v1.5.2 github.com/google/uuid v1.3.0 @@ -15,7 +15,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/spaolacci/murmur3 v1.1.0 github.com/stretchr/testify v1.7.0 - google.golang.org/grpc v1.45.0 + google.golang.org/grpc v1.47.0 google.golang.org/protobuf v1.28.0 ) @@ -28,6 +28,8 @@ require ( github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/goccy/go-json v0.9.6 // indirect github.com/golang/snappy v0.0.4 // indirect + github.com/gonuts/commander v0.1.0 // indirect + github.com/gonuts/flag v0.1.0 // indirect github.com/google/flatbuffers v2.0.6+incompatible // indirect github.com/klauspost/asmfmt v1.3.2 // indirect github.com/klauspost/compress v1.15.1 // indirect @@ -38,7 +40,7 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/zeebo/xxh3 v1.0.2 // indirect golang.org/x/exp v0.0.0-20220407100705-7b9b53b0aca4 // indirect - golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 // indirect + golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect golang.org/x/net v0.0.0-20220407224826-aac1ed45d8e3 // indirect golang.org/x/sys v0.0.0-20220412211240-33da011f77ad // indirect golang.org/x/text v0.3.7 // indirect diff --git a/go.sum b/go.sum index 698d1ef640..11f1ba4d74 100644 --- a/go.sum +++ b/go.sum @@ -55,6 +55,7 @@ github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4/go.mod h1:6pvJx4me5XP github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go 
v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= @@ -83,10 +84,17 @@ github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1m github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0= +github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/feast-dev/gopy v0.4.1-0.20220429180328-4257ac71a4d0 h1:Go714ObVP1O+a6qK7haXVL28QNm6WMD8bwnN9EA8PlM= -github.com/feast-dev/gopy v0.4.1-0.20220429180328-4257ac71a4d0/go.mod h1:ZO6vpitQ61NVoQP/2yOubPS6ET5pP3CAWCiMYn5eqCc= +github.com/feast-dev/gopy v0.4.1-0.20220714205859-591500e3215f h1:tTjEpVu4H/ZGh4wo3WETbA9dutNM6bXMXvyZbb9GLCs= +github.com/feast-dev/gopy v0.4.1-0.20220714205859-591500e3215f/go.mod h1:tlA/KcD7rM8B+NQJR4SASwiinfKY0aiMFanHszR8BZA= +github.com/feast-dev/gopy v0.4.1-0.20220714211038-aa312c13fd79 h1:oFj6GDGR8E4S5GeMyLBvaKtvMZxj3hHqsB5Xndjxjz8= +github.com/feast-dev/gopy v0.4.1-0.20220714211038-aa312c13fd79/go.mod h1:tlA/KcD7rM8B+NQJR4SASwiinfKY0aiMFanHszR8BZA= +github.com/feast-dev/gopy 
v0.4.1-0.20220714211330-67b016d61ed4 h1:UfzPdqqAfrt8f+jDIY61lbzqFZYsX2BhVyNcCbdpE+U= +github.com/feast-dev/gopy v0.4.1-0.20220714211330-67b016d61ed4/go.mod h1:tlA/KcD7rM8B+NQJR4SASwiinfKY0aiMFanHszR8BZA= +github.com/feast-dev/gopy v0.4.1-0.20220714211711-252048177d85 h1:BKmfqWiDbxvviB6vemPbbNjF+ywRsBMCdk1QvrcGgkc= +github.com/feast-dev/gopy v0.4.1-0.20220714211711-252048177d85/go.mod h1:tlA/KcD7rM8B+NQJR4SASwiinfKY0aiMFanHszR8BZA= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= @@ -145,7 +153,9 @@ github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/gonuts/commander v0.1.0 h1:EcDTiVw9oAVORFjQOEOuHQqcl6OXMyTgELocTq6zJ0I= github.com/gonuts/commander v0.1.0/go.mod h1:qkb5mSlcWodYgo7vs8ulLnXhfinhZsZcm6+H/z1JjgY= +github.com/gonuts/flag v0.1.0 h1:fqMv/MZ+oNGu0i9gp0/IQ/ZaPIDoAZBOBaJoV7viCWM= github.com/gonuts/flag v0.1.0/go.mod h1:ZTmTGtrSPejTo/SRNhCqwLTmiAgyBdCkLYhHrAoBdz4= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= @@ -437,8 +447,9 @@ golang.org/x/mod v0.5.1/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro= golang.org/x/mod v0.5.1/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro= golang.org/x/mod v0.6.0-dev.0.20211013180041-c96bc1413d57/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= golang.org/x/mod 
v0.6.0-dev.0.20211013180041-c96bc1413d57/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= -golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 h1:kQgndtyPBW/JIYERgdxfwMYh3AVStj88WQTlNDi2a+o= golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 h1:6zppjxzCulZykYSLyVDYbneBfbaBIQPYMevg0bEwv2s= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -588,8 +599,9 @@ google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTp google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= google.golang.org/grpc v1.44.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= -google.golang.org/grpc v1.45.0 h1:NEpgUqV3Z+ZjkqMsxMg11IaDrXY4RY6CQukSGK0uI1M= google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= +google.golang.org/grpc v1.47.0 h1:9n77onPX5F3qfFCqjy9dhn8PbNQsIKeVU04J9G7umt8= +google.golang.org/grpc v1.47.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/go/README.md b/go/README.md new file mode 100644 index 0000000000..0bca470919 --- 
/dev/null +++ b/go/README.md @@ -0,0 +1,109 @@ +This directory contains the Go logic that's executed by the `EmbeddedOnlineFeatureServer` from Python. + +## Building and Linking +[gopy](https://github.com/go-python/gopy) generates (and compiles) a CPython extension module from a Go package. That's what we're using here, as visible in [setup.py](../setup.py). + +Under the hood, gopy invokes `go build`, and then templates `cgo` stubs for the Go module that exposes the public functions from the Go module as C functions. +For our project, this stuff can be found at `sdk/python/feast/embedded_go/lib/embedded.go` & `sdk/python/feast/embedded_go/lib/embedded_go.h` after running `make compile-go-lib`. + +## Arrow memory management +Understanding this is the trickiest part of this integration. + +At a high level, when using the Python<>Go integration, the Python layer exports request data into an [Arrow Record batch](https://arrow.apache.org/docs/python/data.html) which is transferred to Go using Arrow's zero copy mechanism. +Similarly, the Go layer converts feature values read from the online store into a Record Batch that's exported to Python using the same mechanics. + +The first thing to note is that from the Python perspective, all the export logic assumes that we're exporting to & importing from C, not Go. This is because pyarrow only interops with C, and the fact we're using Go is an implementation detail not relevant to the Python layer. 
+ +### Export Entities & Request data from Python to Go +The code exporting to C is this, in [online_features_service.py](../sdk/python/feast/embedded_go/online_features_service.py) +``` +( + entities_c_schema, + entities_ptr_schema, + entities_c_array, + entities_ptr_array, +) = allocate_schema_and_array() +( + req_data_c_schema, + req_data_ptr_schema, + req_data_c_array, + req_data_ptr_array, +) = allocate_schema_and_array() + +batch, schema = map_to_record_batch(entities, join_keys_types) +schema._export_to_c(entities_ptr_schema) +batch._export_to_c(entities_ptr_array) + +batch, schema = map_to_record_batch(request_data) +schema._export_to_c(req_data_ptr_schema) +batch._export_to_c(req_data_ptr_array) +``` + +Under the hood, `allocate_schema_and_array` allocates a pointer (`struct ArrowSchema*` and `struct ArrowArray*`) in native memory (i.e. the C layer) using `cffi`. +Next, the RecordBatch exports to this pointer using [`_export_to_c`](https://github.com/apache/arrow/blob/master/python/pyarrow/table.pxi#L2509), which uses [`ExportRecordBatch`](https://arrow.apache.org/docs/cpp/api/c_abi.html#_CPPv417ExportRecordBatchRK11RecordBatchP10ArrowArrayP11ArrowSchema) under the hood. + +As per the documentation for ExportRecordBatch: +> Status ExportRecordBatch(const RecordBatch &batch, struct ArrowArray *out, struct ArrowSchema *out_schema = NULLPTR) +> Export C++ RecordBatch using the C data interface format. +> +> The record batch is exported as if it were a struct array. The resulting ArrowArray struct keeps the record batch data and buffers alive until its release callback is called by the consumer. + +This is why `GetOnlineFeatures()` in `online_features.go` calls `record.Release()` as below: +``` +entitiesRecord, err := readArrowRecord(entities) +if err != nil { + return err +} +defer entitiesRecord.Release() +...
+requestDataRecords, err := readArrowRecord(requestData) +if err != nil { + return err +} +defer requestDataRecords.Release() +``` + +Additionally, we need to pass in a pair of pointers to `GetOnlineFeatures()` that are populated by the Go layer, and the resultant feature values can be passed back to Python (via the C layer) using zero-copy semantics. +That happens as follows: +``` +( + features_c_schema, + features_ptr_schema, + features_c_array, + features_ptr_array, +) = allocate_schema_and_array() + +... + +record_batch = pa.RecordBatch._import_from_c( + features_ptr_array, features_ptr_schema +) +``` + +The corresponding Go code that exports this data is: +``` +result := array.NewRecord(arrow.NewSchema(outputFields, nil), outputColumns, int64(numRows)) + +cdata.ExportArrowRecordBatch(result, + cdata.ArrayFromPtr(output.DataPtr), + cdata.SchemaFromPtr(output.SchemaPtr)) +``` + +The documentation for `ExportArrowRecordBatch` is great. It has this super useful caveat: + +> // The release function on the populated CArrowArray will properly decrease the reference counts, +> // and release the memory if the record has already been released. But since this must be explicitly +> // done, make sure it is released so that you do not create a memory leak. + +This implies that the receiver is on the hook for explicitly releasing this memory. + +However, we're using `_import_from_c`, which uses [`ImportRecordBatch`](https://arrow.apache.org/docs/cpp/api/c_abi.html#_CPPv417ImportRecordBatchP10ArrowArrayP11ArrowSchema), which implies that the receiver of the RecordBatch is the new owner of the data. +This is wrapped by pyarrow - and when the corresponding python object goes out of scope, it should clean up the underlying record batch. + +Another thing to note (which may or may not be the source of issues) is that Arrow has the concept of [Memory Pools](https://arrow.apache.org/docs/python/api/memory.html#memory-pools). +Memory pools can be set in python as well as in Go.
I *believe* that if we use the CGoArrowAllocator, that uses whatever pool C++ uses, which should be the same as the one used by PyArrow. But this should be vetted. + + +### References +- https://arrow.apache.org/docs/format/CDataInterface.html#memory-management +- https://arrow.apache.org/docs/python/memory.html \ No newline at end of file diff --git a/go/embedded/online_features.go b/go/embedded/online_features.go index f6b21169e1..7fd34d16e4 100644 --- a/go/embedded/online_features.go +++ b/go/embedded/online_features.go @@ -33,6 +33,11 @@ type OnlineFeatureService struct { grpcStopCh chan os.Signal httpStopCh chan os.Signal + statusColumnBuildersToRelease []*array.Int32Builder + tsColumnBuildersToRelease []*array.Int64Builder + arraysToRelease []arrow.Array + resultsToRelease []arrow.Record + err error } @@ -143,6 +148,7 @@ func (s *OnlineFeatureService) GetOnlineFeatures( if err != nil { return err } + defer entitiesRecord.Release() numRows := entitiesRecord.Column(0).Len() @@ -155,6 +161,7 @@ func (s *OnlineFeatureService) GetOnlineFeatures( if err != nil { return err } + defer requestDataRecords.Release() requestDataProto, err := recordToProto(requestDataRecords) if err != nil { @@ -178,9 +185,27 @@ func (s *OnlineFeatureService) GetOnlineFeatures( return err } + // Release all objects that are no longer required. 
+ for _, statusColumnBuilderToRelease := range s.statusColumnBuildersToRelease { + statusColumnBuilderToRelease.Release() + } + for _, tsColumnBuilderToRelease := range s.tsColumnBuildersToRelease { + tsColumnBuilderToRelease.Release() + } + for _, arrayToRelease := range s.arraysToRelease { + arrayToRelease.Release() + } + for _, resultsToRelease := range s.resultsToRelease { + resultsToRelease.Release() + } + s.statusColumnBuildersToRelease = nil + s.tsColumnBuildersToRelease = nil + s.arraysToRelease = nil + s.resultsToRelease = nil + outputFields := make([]arrow.Field, 0) outputColumns := make([]arrow.Array, 0) - pool := memory.NewGoAllocator() + pool := memory.NewCgoArrowAllocator() for _, featureVector := range resp { outputFields = append(outputFields, arrow.Field{ @@ -210,13 +235,19 @@ func (s *OnlineFeatureService) GetOnlineFeatures( } tsColumn := tsColumnBuilder.NewArray() outputColumns = append(outputColumns, tsColumn) + + // Mark builders and arrays for release. + s.statusColumnBuildersToRelease = append(s.statusColumnBuildersToRelease, statusColumnBuilder) + s.tsColumnBuildersToRelease = append(s.tsColumnBuildersToRelease, tsColumnBuilder) + s.arraysToRelease = append(s.arraysToRelease, statusColumn) + s.arraysToRelease = append(s.arraysToRelease, tsColumn) + s.arraysToRelease = append(s.arraysToRelease, featureVector.Values) } result := array.NewRecord(arrow.NewSchema(outputFields, nil), outputColumns, int64(numRows)) + s.resultsToRelease = append(s.resultsToRelease, result) - cdata.ExportArrowRecordBatch(result, - cdata.ArrayFromPtr(output.DataPtr), - cdata.SchemaFromPtr(output.SchemaPtr)) + cdata.ExportArrowRecordBatch(result, cdata.ArrayFromPtr(output.DataPtr), cdata.SchemaFromPtr(output.SchemaPtr)) return nil } diff --git a/go/internal/feast/featurestore.go b/go/internal/feast/featurestore.go index ad1f94a4ba..ed38411460 100644 --- a/go/internal/feast/featurestore.go +++ b/go/internal/feast/featurestore.go @@ -113,7 +113,7 @@ func (fs 
*FeatureStore) GetOnlineFeatures( } result := make([]*onlineserving.FeatureVector, 0) - arrowMemory := memory.NewGoAllocator() + arrowMemory := memory.NewCgoArrowAllocator() featureViews := make([]*model.FeatureView, len(requestedFeatureViews)) index := 0 for _, featuresAndView := range requestedFeatureViews { diff --git a/go/internal/feast/onlineserving/serving.go b/go/internal/feast/onlineserving/serving.go index e2a2df923b..3c6f545153 100644 --- a/go/internal/feast/onlineserving/serving.go +++ b/go/internal/feast/onlineserving/serving.go @@ -415,6 +415,8 @@ func KeepOnlyRequestedFeatures( vectorsByName := make(map[string]*FeatureVector) expectedVectors := make([]*FeatureVector, 0) + usedVectors := make(map[string]bool) + for _, vector := range vectors { vectorsByName[vector.Name] = vector } @@ -438,6 +440,14 @@ func KeepOnlyRequestedFeatures( return nil, fmt.Errorf("requested feature %s can't be retrieved", featureRef) } expectedVectors = append(expectedVectors, vectorsByName[qualifiedName]) + usedVectors[qualifiedName] = true + } + + // Free arrow arrays for vectors that were not used. 
+ for _, vector := range vectors { + if _, ok := usedVectors[vector.Name]; !ok { + vector.Values.Release() + } } return expectedVectors, nil diff --git a/go/internal/feast/onlinestore/onlinestore.go b/go/internal/feast/onlinestore/onlinestore.go index 64a05f144c..88cd3dbd9b 100644 --- a/go/internal/feast/onlinestore/onlinestore.go +++ b/go/internal/feast/onlinestore/onlinestore.go @@ -61,7 +61,7 @@ func NewOnlineStore(config *registry.RepoConfig) (OnlineStore, error) { onlineStore, err := NewSqliteOnlineStore(config.Project, config, config.OnlineStore) return onlineStore, err } else if onlineStoreType == "redis" { - onlineStore, err := NewRedisOnlineStore(config.Project, config.OnlineStore) + onlineStore, err := NewRedisOnlineStore(config.Project, config, config.OnlineStore) return onlineStore, err } else { return nil, fmt.Errorf("%s online store type is currently not supported; only redis and sqlite are supported", onlineStoreType) diff --git a/go/internal/feast/onlinestore/redisonlinestore.go b/go/internal/feast/onlinestore/redisonlinestore.go index 26f34cf896..8fb85085d4 100644 --- a/go/internal/feast/onlinestore/redisonlinestore.go +++ b/go/internal/feast/onlinestore/redisonlinestore.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "errors" "fmt" + "github.com/feast-dev/feast/go/internal/feast/registry" "sort" "strconv" "strings" @@ -13,7 +14,7 @@ import ( "github.com/go-redis/redis/v8" "github.com/golang/protobuf/proto" "github.com/spaolacci/murmur3" - timestamppb "google.golang.org/protobuf/types/known/timestamppb" + "google.golang.org/protobuf/types/known/timestamppb" "github.com/feast-dev/feast/go/protos/feast/serving" "github.com/feast-dev/feast/go/protos/feast/types" @@ -37,10 +38,15 @@ type RedisOnlineStore struct { // Redis client connector client *redis.Client + + config *registry.RepoConfig } -func NewRedisOnlineStore(project string, onlineStoreConfig map[string]interface{}) (*RedisOnlineStore, error) { - store := RedisOnlineStore{project: project} +func 
NewRedisOnlineStore(project string, config *registry.RepoConfig, onlineStoreConfig map[string]interface{}) (*RedisOnlineStore, error) { + store := RedisOnlineStore{ + project: project, + config: config, + } var address []string var password string @@ -161,7 +167,7 @@ func (r *RedisOnlineStore) OnlineRead(ctx context.Context, entityKeys []*types.E redisKeyToEntityIndex := make(map[string]int) for i := 0; i < len(entityKeys); i++ { - var key, err = buildRedisKey(r.project, entityKeys[i]) + var key, err = buildRedisKey(r.project, entityKeys[i], r.config.EntityKeySerializationVersion) if err != nil { return nil, err } @@ -270,8 +276,8 @@ func (r *RedisOnlineStore) Destruct() { } -func buildRedisKey(project string, entityKey *types.EntityKey) (*[]byte, error) { - serKey, err := serializeEntityKey(entityKey) +func buildRedisKey(project string, entityKey *types.EntityKey, entityKeySerializationVersion int64) (*[]byte, error) { + serKey, err := serializeEntityKey(entityKey, entityKeySerializationVersion) if err != nil { return nil, err } @@ -279,7 +285,7 @@ func buildRedisKey(project string, entityKey *types.EntityKey) (*[]byte, error) return &fullKey, nil } -func serializeEntityKey(entityKey *types.EntityKey) (*[]byte, error) { +func serializeEntityKey(entityKey *types.EntityKey, entityKeySerializationVersion int64) (*[]byte, error) { // Serialize entity key to a bytestring so that it can be used as a lookup key in a hash table. 
// Ensure that we have the right amount of join keys and entity values @@ -316,7 +322,7 @@ func serializeEntityKey(entityKey *types.EntityKey) (*[]byte, error) { offset := (2 * len(keys)) + (i * 3) value := m[keys[i]].GetVal() - valueBytes, valueTypeBytes, err := serializeValue(value) + valueBytes, valueTypeBytes, err := serializeValue(value, entityKeySerializationVersion) if err != nil { return valueBytes, err } @@ -341,7 +347,7 @@ func serializeEntityKey(entityKey *types.EntityKey) (*[]byte, error) { return &entityKeyBuffer, nil } -func serializeValue(value interface{}) (*[]byte, types.ValueType_Enum, error) { +func serializeValue(value interface{}, entityKeySerializationVersion int64) (*[]byte, types.ValueType_Enum, error) { // TODO: Implement support for other types (at least the major types like ints, strings, bytes) switch x := (value).(type) { case *types.Value_StringVal: @@ -354,10 +360,16 @@ func serializeValue(value interface{}) (*[]byte, types.ValueType_Enum, error) { binary.LittleEndian.PutUint32(valueBuffer, uint32(x.Int32Val)) return &valueBuffer, types.ValueType_INT32, nil case *types.Value_Int64Val: - // TODO (woop): We unfortunately have to use 32 bit here for backward compatibility :( - valueBuffer := make([]byte, 4) - binary.LittleEndian.PutUint32(valueBuffer, uint32(x.Int64Val)) - return &valueBuffer, types.ValueType_INT64, nil + if entityKeySerializationVersion <= 1 { + // We unfortunately have to use 32 bit here for backward compatibility :( + valueBuffer := make([]byte, 4) + binary.LittleEndian.PutUint32(valueBuffer, uint32(x.Int64Val)) + return &valueBuffer, types.ValueType_INT64, nil + } else { + valueBuffer := make([]byte, 8) + binary.LittleEndian.PutUint64(valueBuffer, uint64(x.Int64Val)) + return &valueBuffer, types.ValueType_INT64, nil + } case nil: return nil, types.ValueType_INVALID, fmt.Errorf("could not detect type for %v", x) default: diff --git a/go/internal/feast/onlinestore/redisonlinestore_test.go 
b/go/internal/feast/onlinestore/redisonlinestore_test.go index 43cdbe06a2..ad9ef1e1e4 100644 --- a/go/internal/feast/onlinestore/redisonlinestore_test.go +++ b/go/internal/feast/onlinestore/redisonlinestore_test.go @@ -1,6 +1,7 @@ package onlinestore import ( + "github.com/feast-dev/feast/go/internal/feast/registry" "testing" "github.com/stretchr/testify/assert" @@ -10,7 +11,11 @@ func TestNewRedisOnlineStore(t *testing.T) { var config = map[string]interface{}{ "connection_string": "redis://localhost:6379", } - store, err := NewRedisOnlineStore("test", config) + rc := ®istry.RepoConfig{ + OnlineStore: config, + EntityKeySerializationVersion: 2, + } + store, err := NewRedisOnlineStore("test", rc, config) assert.Nil(t, err) var opts = store.client.Options() assert.Equal(t, opts.Addr, "redis://localhost:6379") @@ -23,7 +28,11 @@ func TestNewRedisOnlineStoreWithPassword(t *testing.T) { var config = map[string]interface{}{ "connection_string": "redis://localhost:6379,password=secret", } - store, err := NewRedisOnlineStore("test", config) + rc := ®istry.RepoConfig{ + OnlineStore: config, + EntityKeySerializationVersion: 2, + } + store, err := NewRedisOnlineStore("test", rc, config) assert.Nil(t, err) var opts = store.client.Options() assert.Equal(t, opts.Addr, "redis://localhost:6379") @@ -34,7 +43,11 @@ func TestNewRedisOnlineStoreWithDB(t *testing.T) { var config = map[string]interface{}{ "connection_string": "redis://localhost:6379,db=1", } - store, err := NewRedisOnlineStore("test", config) + rc := ®istry.RepoConfig{ + OnlineStore: config, + EntityKeySerializationVersion: 2, + } + store, err := NewRedisOnlineStore("test", rc, config) assert.Nil(t, err) var opts = store.client.Options() assert.Equal(t, opts.Addr, "redis://localhost:6379") @@ -45,7 +58,11 @@ func TestNewRedisOnlineStoreWithSsl(t *testing.T) { var config = map[string]interface{}{ "connection_string": "redis://localhost:6379,ssl=true", } - store, err := NewRedisOnlineStore("test", config) + rc := 
®istry.RepoConfig{ + OnlineStore: config, + EntityKeySerializationVersion: 2, + } + store, err := NewRedisOnlineStore("test", rc, config) assert.Nil(t, err) var opts = store.client.Options() assert.Equal(t, opts.Addr, "redis://localhost:6379") diff --git a/go/internal/feast/onlinestore/sqliteonlinestore.go b/go/internal/feast/onlinestore/sqliteonlinestore.go index 94ba0c0d56..1f407ad39c 100644 --- a/go/internal/feast/onlinestore/sqliteonlinestore.go +++ b/go/internal/feast/onlinestore/sqliteonlinestore.go @@ -16,7 +16,7 @@ import ( _ "github.com/mattn/go-sqlite3" "google.golang.org/protobuf/proto" - timestamppb "google.golang.org/protobuf/types/known/timestamppb" + "google.golang.org/protobuf/types/known/timestamppb" "github.com/feast-dev/feast/go/protos/feast/serving" "github.com/feast-dev/feast/go/protos/feast/types" @@ -24,15 +24,16 @@ import ( type SqliteOnlineStore struct { // Feast project name - project string - path string - db *sql.DB - db_mu sync.Mutex + project string + path string + db *sql.DB + db_mu sync.Mutex + repoConfig *registry.RepoConfig } // Creates a new sqlite online store object. onlineStoreConfig should have relative path of database file with respect to repoConfig.repoPath. func NewSqliteOnlineStore(project string, repoConfig *registry.RepoConfig, onlineStoreConfig map[string]interface{}) (*SqliteOnlineStore, error) { - store := SqliteOnlineStore{project: project} + store := SqliteOnlineStore{project: project, repoConfig: repoConfig} if db_path, ok := onlineStoreConfig["path"]; !ok { return nil, fmt.Errorf("cannot find sqlite path %s", db_path) } else { @@ -69,7 +70,7 @@ func (s *SqliteOnlineStore) OnlineRead(ctx context.Context, entityKeys []*types. 
in_query := make([]string, len(entityKeys)) serialized_entities := make([]interface{}, len(entityKeys)) for i := 0; i < len(entityKeys); i++ { - serKey, err := serializeEntityKey(entityKeys[i]) + serKey, err := serializeEntityKey(entityKeys[i], s.repoConfig.EntityKeySerializationVersion) if err != nil { return nil, err } diff --git a/go/internal/feast/onlinestore/sqliteonlinestore_test.go b/go/internal/feast/onlinestore/sqliteonlinestore_test.go index 5af1c1f4ce..e5e6e85e56 100644 --- a/go/internal/feast/onlinestore/sqliteonlinestore_test.go +++ b/go/internal/feast/onlinestore/sqliteonlinestore_test.go @@ -17,9 +17,9 @@ import ( func TestSqliteAndFeatureRepoSetup(t *testing.T) { dir := t.TempDir() feature_repo_path := filepath.Join(dir, "feature_repo") + err := test.SetupCleanFeatureRepo(dir) assert.Nil(t, err) - config, err := registry.NewRepoConfigFromFile(feature_repo_path) assert.Nil(t, err) assert.Equal(t, "feature_repo", config.Project) @@ -37,9 +37,9 @@ func TestSqliteOnlineRead(t *testing.T) { dir := t.TempDir() feature_repo_path := filepath.Join(dir, "feature_repo") test.SetupCleanFeatureRepo(dir) - config, err := registry.NewRepoConfigFromFile(feature_repo_path) assert.Nil(t, err) + store, err := NewSqliteOnlineStore("feature_repo", config, config.OnlineStore) defer store.Destruct() assert.Nil(t, err) diff --git a/go/internal/feast/registry/repoconfig.go b/go/internal/feast/registry/repoconfig.go index 59d125b1bf..b034b632dc 100644 --- a/go/internal/feast/registry/repoconfig.go +++ b/go/internal/feast/registry/repoconfig.go @@ -30,6 +30,8 @@ type RepoConfig struct { Flags map[string]interface{} `json:"flags"` // RepoPath RepoPath string `json:"repo_path"` + // EntityKeySerializationVersion + EntityKeySerializationVersion int64 `json:"entity_key_serialization_version"` } type RegistryConfig struct { diff --git a/go/internal/feast/server/logging/memorybuffer.go b/go/internal/feast/server/logging/memorybuffer.go index 9ffb0ff73b..c9f00218df 100644 --- 
a/go/internal/feast/server/logging/memorybuffer.go +++ b/go/internal/feast/server/logging/memorybuffer.go @@ -128,7 +128,7 @@ func getArrowSchema(schema *FeatureServiceSchema) (*arrow.Schema, error) { // and writes them to arrow table. // Returns arrow table that contains all of the logs in columnar format. func (b *MemoryBuffer) convertToArrowRecord() (arrow.Record, error) { - arrowMemory := memory.NewGoAllocator() + arrowMemory := memory.NewCgoArrowAllocator() numRows := len(b.logs) columns := make(map[string][]*types.Value) diff --git a/go/internal/feast/server/logging/memorybuffer_test.go b/go/internal/feast/server/logging/memorybuffer_test.go index 94f0f86ef0..ec83680f4f 100644 --- a/go/internal/feast/server/logging/memorybuffer_test.go +++ b/go/internal/feast/server/logging/memorybuffer_test.go @@ -118,7 +118,7 @@ func TestSerializeToArrowTable(t *testing.T) { LogTimestamp: time.Now(), }) - pool := memory.NewGoAllocator() + pool := memory.NewCgoArrowAllocator() builder := array.NewRecordBuilder(pool, b.arrowSchema) defer builder.Release() @@ -159,7 +159,7 @@ func TestSerializeToArrowTable(t *testing.T) { expectedRecord := builder.NewRecord() assert.Nil(t, err) for colIdx := 0; colIdx < int(record.NumCols()); colIdx++ { - assert.Equal(t, expectedRecord.Column(colIdx), record.Column(colIdx), "Columns with idx %d are not equal", colIdx) + assert.True(t, array.Equal(expectedRecord.Column(colIdx), record.Column(colIdx)), "Columns with idx %d are not equal", colIdx) } } diff --git a/infra/charts/feast-feature-server/.helmignore b/infra/charts/feast-feature-server/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/infra/charts/feast-feature-server/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/infra/charts/feast-feature-server/Chart.yaml b/infra/charts/feast-feature-server/Chart.yaml new file mode 100644 index 0000000000..6c1afc9540 --- /dev/null +++ b/infra/charts/feast-feature-server/Chart.yaml @@ -0,0 +1,12 @@ +apiVersion: v2 +name: feast-feature-server +description: Feast Feature Server in Go or Python +type: application +version: 0.22.0 +keywords: + - machine learning + - big data + - mlops +home: https://feast.dev/ +sources: + - https://github.com/feast-dev/feast diff --git a/infra/charts/feast-feature-server/README.md b/infra/charts/feast-feature-server/README.md new file mode 100644 index 0000000000..a55451e788 --- /dev/null +++ b/infra/charts/feast-feature-server/README.md @@ -0,0 +1,82 @@ +# feast-feature-server + +![Version: 0.22.0](https://img.shields.io/badge/Version-0.22.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) + +Feast Feature Server in Go or Python + +**Homepage:** + +## Source Code + +* + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| affinity | object | `{}` | | +| fullnameOverride | string | `""` | | +| image.pullPolicy | string | `"IfNotPresent"` | | +| image.repository | string | `""` | | +| image.tag | string | `""` | | +| imagePullSecrets | list | `[]` | | +| livenessProbe.initialDelaySeconds | int | `30` | | +| livenessProbe.periodSeconds | int | `30` | | +| nameOverride | string | `""` | | +| nodeSelector | object | `{}` | | +| podAnnotations | object | `{}` | | +| podSecurityContext | object | `{}` | | +| readinessProbe.initialDelaySeconds | int | `20` | | +| readinessProbe.periodSeconds | int | `10` | | +| replicaCount | int | `1` | | +| resources | object | `{}` | | +| 
securityContext | object | `{}` | | +| service.port | int | `80` | | +| service.type | string | `"ClusterIP"` | | +| tolerations | list | `[]` | | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) + + +Docker repository and tag are required. Helm install example: +``` +helm install feast-feature-server . --set image.repository=REPO --set image.tag=TAG +``` + +Deployment assumes that `feature_store.yaml` exists on docker image. Example docker image: +``` +FROM python:3.8 + +RUN apt update && \ + apt install -y jq + +RUN pip install pip --upgrade + +RUN pip install feast + +COPY feature_store.yaml /feature_store.yaml +``` + +Furthermore, if you wish to use the Go feature server, then you must install the Apache Arrow C++ libraries, and your `feature_store.yaml` should include `go_feature_server: True`. +For more details, see the [docs](https://docs.feast.dev/reference/feature-servers/go-feature-server). 
+The docker image might look like: +``` +FROM python:3.8 + +RUN apt update && \ + apt install -y jq + +RUN pip install pip --upgrade + +RUN pip install feast + +RUN apt update +RUN apt install -y -V ca-certificates lsb-release wget +RUN wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb +RUN apt update +RUN apt -y install libarrow-dev + +COPY feature_store.yaml /feature_store.yaml +``` \ No newline at end of file diff --git a/infra/charts/feast-feature-server/templates/_helpers.tpl b/infra/charts/feast-feature-server/templates/_helpers.tpl new file mode 100644 index 0000000000..19c2febd13 --- /dev/null +++ b/infra/charts/feast-feature-server/templates/_helpers.tpl @@ -0,0 +1,52 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "feast-feature-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "feast-feature-server.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "feast-feature-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "feast-feature-server.labels" -}} +helm.sh/chart: {{ include "feast-feature-server.chart" . }} +{{ include "feast-feature-server.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "feast-feature-server.selectorLabels" -}} +app.kubernetes.io/name: {{ include "feast-feature-server.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/infra/charts/feast-feature-server/templates/deployment.yaml b/infra/charts/feast-feature-server/templates/deployment.yaml new file mode 100644 index 0000000000..69cf92f6c0 --- /dev/null +++ b/infra/charts/feast-feature-server/templates/deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "feast-feature-server.fullname" . }} + labels: + {{- include "feast-feature-server.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "feast-feature-server.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "feast-feature-server.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["feast", "serve", "-h", "0.0.0.0"] + ports: + - name: http + containerPort: 6566 + protocol: TCP + livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.livenessProbe.periodSeconds }} + readinessProbe: + tcpSocket: + port: http + initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.readinessProbe.periodSeconds }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/infra/charts/feast-feature-server/templates/service.yaml b/infra/charts/feast-feature-server/templates/service.yaml new file mode 100644 index 0000000000..d6914828e4 --- /dev/null +++ b/infra/charts/feast-feature-server/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "feast-feature-server.name" . }} + labels: + {{- include "feast-feature-server.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "feast-feature-server.selectorLabels" . 
| nindent 4 }} diff --git a/infra/charts/feast-feature-server/values.yaml b/infra/charts/feast-feature-server/values.yaml new file mode 100644 index 0000000000..f62f95a757 --- /dev/null +++ b/infra/charts/feast-feature-server/values.yaml @@ -0,0 +1,57 @@ +# Default values for feast. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: "" + pullPolicy: IfNotPresent + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 80 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +livenessProbe: + initialDelaySeconds: 30 + periodSeconds: 30 + +readinessProbe: + initialDelaySeconds: 20 + periodSeconds: 10 diff --git a/infra/charts/feast-python-server/Chart.yaml b/infra/charts/feast-python-server/Chart.yaml index 6c4751e3b7..6ab82b7a65 100644 --- a/infra/charts/feast-python-server/Chart.yaml +++ b/infra/charts/feast-python-server/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: feast-python-server description: Feast Feature Server in Python type: application -version: 0.22.0 +version: 0.23.0 keywords: - machine learning - big data diff --git a/infra/charts/feast-python-server/README.md b/infra/charts/feast-python-server/README.md index 3f60cc6c54..e3da9b1d29 100644 --- a/infra/charts/feast-python-server/README.md +++ b/infra/charts/feast-python-server/README.md @@ -1,6 +1,6 @@ # feast-python-server -![Version: 0.22.0](https://img.shields.io/badge/Version-0.22.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) +![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) Feast Feature Server in Python @@ -46,7 +46,7 @@ helm install feast-python-server . --set image.repository=REPO --set image.tag=T Deployment assumes that `feature_store.yaml` exists on docker image. 
Example docker image: ``` -FROM python:3.7 +FROM python:3.8 RUN apt update && \ apt install -y jq diff --git a/infra/charts/feast/Chart.yaml b/infra/charts/feast/Chart.yaml index 012dc47de9..f4e33de7f3 100644 --- a/infra/charts/feast/Chart.yaml +++ b/infra/charts/feast/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Feature store for machine learning name: feast -version: 0.22.0 +version: 0.23.0 keywords: - machine learning - big data diff --git a/infra/charts/feast/README.md b/infra/charts/feast/README.md index 8b5e9718ef..f71dcf6124 100644 --- a/infra/charts/feast/README.md +++ b/infra/charts/feast/README.md @@ -8,7 +8,7 @@ This repo contains Helm charts for Feast components that are being installed on ## Chart: Feast -Feature store for machine learning Current chart version is `0.22.0` +Feature store for machine learning Current chart version is `0.23.0` ## Installation @@ -55,8 +55,8 @@ For more details, please see: https://docs.feast.dev/how-to-guides/running-feast | Repository | Name | Version | |------------|------|---------| | https://charts.helm.sh/stable | redis | 10.5.6 | -| https://feast-helm-charts.storage.googleapis.com | feature-server(feature-server) | 0.22.0 | -| https://feast-helm-charts.storage.googleapis.com | transformation-service(transformation-service) | 0.22.0 | +| https://feast-helm-charts.storage.googleapis.com | feature-server(feature-server) | 0.23.0 | +| https://feast-helm-charts.storage.googleapis.com | transformation-service(transformation-service) | 0.23.0 | ## Values diff --git a/infra/charts/feast/charts/feature-server/Chart.yaml b/infra/charts/feast/charts/feature-server/Chart.yaml index b88ad599b4..ee08b0b0f8 100644 --- a/infra/charts/feast/charts/feature-server/Chart.yaml +++ b/infra/charts/feast/charts/feature-server/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 description: "Feast Feature Server: Online feature serving service for Feast" name: feature-server -version: 0.22.0 -appVersion: v0.22.0 +version: 0.23.0 
+appVersion: v0.23.0 keywords: - machine learning - big data diff --git a/infra/charts/feast/charts/feature-server/README.md b/infra/charts/feast/charts/feature-server/README.md index 28570b0fc6..4717cfff3a 100644 --- a/infra/charts/feast/charts/feature-server/README.md +++ b/infra/charts/feast/charts/feature-server/README.md @@ -1,6 +1,6 @@ # feature-server -![Version: 0.22.0](https://img.shields.io/badge/Version-0.22.0-informational?style=flat-square) ![AppVersion: v0.22.0](https://img.shields.io/badge/AppVersion-v0.22.0-informational?style=flat-square) +![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![AppVersion: v0.23.0](https://img.shields.io/badge/AppVersion-v0.23.0-informational?style=flat-square) Feast Feature Server: Online feature serving service for Feast @@ -17,7 +17,7 @@ Feast Feature Server: Online feature serving service for Feast | envOverrides | object | `{}` | Extra environment variables to set | | image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | | image.repository | string | `"feastdev/feature-server-java"` | Docker image for Feature Server repository | -| image.tag | string | `"0.22.0"` | Image tag | +| image.tag | string | `"0.23.0"` | Image tag | | ingress.grpc.annotations | object | `{}` | Extra annotations for the ingress | | ingress.grpc.auth.enabled | bool | `false` | Flag to enable auth | | ingress.grpc.class | string | `"nginx"` | Which ingress controller to use | diff --git a/infra/charts/feast/charts/feature-server/values.yaml b/infra/charts/feast/charts/feature-server/values.yaml index df8367fede..011ce9dc33 100644 --- a/infra/charts/feast/charts/feature-server/values.yaml +++ b/infra/charts/feast/charts/feature-server/values.yaml @@ -5,7 +5,7 @@ image: # image.repository -- Docker image for Feature Server repository repository: feastdev/feature-server-java # image.tag -- Image tag - tag: 0.22.0 + tag: 0.23.0 # image.pullPolicy -- Image pull policy pullPolicy: 
IfNotPresent diff --git a/infra/charts/feast/charts/transformation-service/Chart.yaml b/infra/charts/feast/charts/transformation-service/Chart.yaml index 148e136acf..07055730c5 100644 --- a/infra/charts/feast/charts/transformation-service/Chart.yaml +++ b/infra/charts/feast/charts/transformation-service/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 description: "Transformation service: to compute on-demand features" name: transformation-service -version: 0.22.0 -appVersion: v0.22.0 +version: 0.23.0 +appVersion: v0.23.0 keywords: - machine learning - big data diff --git a/infra/charts/feast/charts/transformation-service/README.md b/infra/charts/feast/charts/transformation-service/README.md index 4cbc1048f6..9bc7a1e5d6 100644 --- a/infra/charts/feast/charts/transformation-service/README.md +++ b/infra/charts/feast/charts/transformation-service/README.md @@ -1,6 +1,6 @@ # transformation-service -![Version: 0.22.0](https://img.shields.io/badge/Version-0.22.0-informational?style=flat-square) ![AppVersion: v0.22.0](https://img.shields.io/badge/AppVersion-v0.22.0-informational?style=flat-square) +![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![AppVersion: v0.23.0](https://img.shields.io/badge/AppVersion-v0.23.0-informational?style=flat-square) Transformation service: to compute on-demand features @@ -13,7 +13,7 @@ Transformation service: to compute on-demand features | envOverrides | object | `{}` | Extra environment variables to set | | image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | | image.repository | string | `"feastdev/feature-transformation-server"` | Docker image for Transformation Server repository | -| image.tag | string | `"0.22.0"` | Image tag | +| image.tag | string | `"0.23.0"` | Image tag | | nodeSelector | object | `{}` | Node labels for pod assignment | | podLabels | object | `{}` | Labels to be added to Feast Serving pods | | replicaCount | int | `1` | Number of pods that will be created | 
diff --git a/infra/charts/feast/charts/transformation-service/values.yaml b/infra/charts/feast/charts/transformation-service/values.yaml index 1264ea4f7b..c1e506a476 100644 --- a/infra/charts/feast/charts/transformation-service/values.yaml +++ b/infra/charts/feast/charts/transformation-service/values.yaml @@ -5,7 +5,7 @@ image: # image.repository -- Docker image for Transformation Server repository repository: feastdev/feature-transformation-server # image.tag -- Image tag - tag: 0.22.0 + tag: 0.23.0 # image.pullPolicy -- Image pull policy pullPolicy: IfNotPresent diff --git a/infra/charts/feast/requirements.yaml b/infra/charts/feast/requirements.yaml index 0b69f295e7..c88fb7a4fa 100644 --- a/infra/charts/feast/requirements.yaml +++ b/infra/charts/feast/requirements.yaml @@ -1,12 +1,12 @@ dependencies: - name: feature-server alias: feature-server - version: 0.22.0 + version: 0.23.0 condition: feature-server.enabled repository: https://feast-helm-charts.storage.googleapis.com - name: transformation-service alias: transformation-service - version: 0.22.0 + version: 0.23.0 condition: transformation-service.enabled repository: https://feast-helm-charts.storage.googleapis.com - name: redis diff --git a/infra/templates/README.md.jinja2 b/infra/templates/README.md.jinja2 index cd6e42c1d1..6a8ebdbab7 100644 --- a/infra/templates/README.md.jinja2 +++ b/infra/templates/README.md.jinja2 @@ -16,7 +16,14 @@ ## Overview -Feast is an open source feature store for machine learning. Feast is the fastest path to productionizing analytic data for model training and online inference. +Feast (**Fea**ture **St**ore) is an open source feature store for machine learning. Feast is the fastest path to manage existing infrastructure to productionize analytic data for model training and online inference. 
+ + +Feast allows ML platform teams to: + +* **Make features consistently available for training and serving** by managing an _offline store_ (to process historical data for scale-out batch scoring or model training), a low-latency _online store_ (to power real-time prediction)_,_ and a battle-tested _feature server_ (for serving pre-computed features online). +* **Avoid data leakage** by generating point-in-time correct feature sets so data scientists can focus on feature engineering rather than debugging error-prone dataset joining logic. This ensure that future feature values do not leak to models during training. +* **Decouple ML from data infrastructure** by providing a single data access layer that abstracts feature storage from feature retrieval, ensuring models remain portable as you move from training models to serving models, from batch models to realtime models, and from one data infra system to another. Please see our [documentation](https://docs.feast.dev/) for more information about the project. diff --git a/java/CONTRIBUTING.md b/java/CONTRIBUTING.md index 86eacfef41..f6c789d984 100644 --- a/java/CONTRIBUTING.md +++ b/java/CONTRIBUTING.md @@ -1,5 +1,5 @@ # Development Guide: feast-java -> The higher level [Development Guide](https://docs.feast.dev/contributing/development-guide) +> The higher level [Development Guide](https://docs.feast.dev/v/master/project/development-guide) > gives contributing to Feast codebase as a whole. ### Overview @@ -9,7 +9,7 @@ the feast-java Repository: - [Feast Java Client](#feast-java-client) > Don't see the Feast component that you want to contribute to here? -> Check out the [Development Guide](https://docs.feast.dev/contributing/development-guide) +> Check out the [Development Guide](https://docs.feast.dev/v/master/project/development-guide) > to learn how Feast components are distributed over multiple repositories. 
#### Common Setup diff --git a/java/README.md b/java/README.md index ff5a1b8553..8c3d93628e 100644 --- a/java/README.md +++ b/java/README.md @@ -13,8 +13,8 @@ The process of ingesting data into the online store (Redis) is decoupled from th ### Contributing Guides on Contributing: -- [Contribution Process for Feast](https://docs.feast.dev/v/master/contributing/contributing) -- [Development Guide for Feast](https://docs.feast.dev/contributing/development-guide) +- [Contribution Process for Feast](https://docs.feast.dev/v/master/project/contributing) +- [Development Guide for Feast](https://docs.feast.dev/v/master/project/development-guide) - [Development Guide for feast-java (this repository)](CONTRIBUTING.md) ### Installing using Helm diff --git a/java/pom.xml b/java/pom.xml index 7ea4bc07bd..0bf92ee244 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -38,7 +38,7 @@ - 0.22.0 + 0.23.0 https://github.com/feast-dev/feast UTF-8 @@ -70,7 +70,7 @@ 2.0.1.Final 0.21.0 1.6.6 - 29.0-jre + 30.1-jre com.amazonaws aws-java-sdk-s3 - 1.12.110 + 1.12.261 diff --git a/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java b/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java index 268592d20a..5850eb6483 100644 --- a/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java +++ b/java/serving/src/main/java/feast/serving/config/ApplicationProperties.java @@ -38,22 +38,84 @@ /** Feast Serving properties. 
*/ public class ApplicationProperties { private static final Logger log = org.slf4j.LoggerFactory.getLogger(ApplicationProperties.class); + private FeastProperties feast; + private GrpcServer grpc; + private RestServer rest; - public static class FeastProperties { - /* Feast Serving build version */ - @NotBlank private String version = "unknown"; + public FeastProperties getFeast() { + return feast; + } - public void setRegistry(String registry) { - this.registry = registry; + public void setFeast(FeastProperties feast) { + this.feast = feast; + } + + public GrpcServer getGrpc() { + return grpc; + } + + public void setGrpc(GrpcServer grpc) { + this.grpc = grpc; + } + + public RestServer getRest() { + return rest; + } + + public void setRest(RestServer rest) { + this.rest = rest; + } + + /** + * Validates all FeastProperties. This method runs after properties have been initialized and + * individually and conditionally validates each class. + */ + @PostConstruct + public void validate() { + ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); + Validator validator = factory.getValidator(); + + // Validate root fields in FeastProperties + Set> violations = validator.validate(this); + if (!violations.isEmpty()) { + throw new ConstraintViolationException(violations); } + } + public enum StoreType { + REDIS, + REDIS_CLUSTER; + } + + public static class FeastProperties { + /* Feast Serving build version */ + @NotBlank private String version = "unknown"; @NotBlank private String registry; + @NotBlank private String project; + private int registryRefreshInterval; + private int entityKeySerializationVersion; + /** Name of the active store configuration (only one store can be active at a time). */ + @NotBlank private String activeStore; + /** + * Collection of store configurations. The active store is selected by the "activeStore" field. + */ + @JsonMerge(OptBoolean.FALSE) + private List stores = new ArrayList<>(); + /* Metric tracing properties. 
*/ + private TracingProperties tracing; + /* Feast Audit Logging properties */ + @NotNull private LoggingProperties logging; + private String gcpProject; + private String awsRegion; + private String transformationServiceEndpoint; public String getRegistry() { return registry; } - @NotBlank private String project; + public void setRegistry(String registry) { + this.registry = registry; + } public String getProject() { return project; @@ -63,8 +125,6 @@ public void setProject(final String project) { this.project = project; } - private int registryRefreshInterval; - public int getRegistryRefreshInterval() { return registryRefreshInterval; } @@ -73,6 +133,14 @@ public void setRegistryRefreshInterval(int registryRefreshInterval) { this.registryRefreshInterval = registryRefreshInterval; } + public int getEntityKeySerializationVersion() { + return entityKeySerializationVersion; + } + + public void setEntityKeySerializationVersion(int entityKeySerializationVersion) { + this.entityKeySerializationVersion = entityKeySerializationVersion; + } + /** * Finds and returns the active store * @@ -92,25 +160,6 @@ public void setActiveStore(String activeStore) { this.activeStore = activeStore; } - /** Name of the active store configuration (only one store can be active at a time). */ - @NotBlank private String activeStore; - - /** - * Collection of store configurations. The active store is selected by the "activeStore" field. - */ - @JsonMerge(OptBoolean.FALSE) - private List stores = new ArrayList<>(); - - /* Metric tracing properties. */ - private TracingProperties tracing; - - /* Feast Audit Logging properties */ - @NotNull private LoggingProperties logging; - - public void setStores(List stores) { - this.stores = stores; - } - /** * Gets Serving store configuration as a list of {@link Store}. * @@ -120,6 +169,10 @@ public List getStores() { return stores; } + public void setStores(List stores) { + this.stores = stores; + } + /** * Gets Feast Serving build version. 
* @@ -129,10 +182,6 @@ public String getVersion() { return version; } - public void setTracing(TracingProperties tracing) { - this.tracing = tracing; - } - /** * Gets tracing properties * @@ -142,6 +191,10 @@ public TracingProperties getTracing() { return tracing; } + public void setTracing(TracingProperties tracing) { + this.tracing = tracing; + } + /** * Gets logging properties * @@ -151,8 +204,6 @@ public LoggingProperties getLogging() { return logging; } - private String gcpProject; - public String getGcpProject() { return gcpProject; } @@ -161,17 +212,13 @@ public void setGcpProject(String gcpProject) { this.gcpProject = gcpProject; } - public void setAwsRegion(String awsRegion) { - this.awsRegion = awsRegion; - } - - private String awsRegion; - public String getAwsRegion() { return awsRegion; } - private String transformationServiceEndpoint; + public void setAwsRegion(String awsRegion) { + this.awsRegion = awsRegion; + } public String getTransformationServiceEndpoint() { return transformationServiceEndpoint; @@ -182,16 +229,6 @@ public void setTransformationServiceEndpoint(String transformationServiceEndpoin } } - private FeastProperties feast; - - public void setFeast(FeastProperties feast) { - this.feast = feast; - } - - public FeastProperties getFeast() { - return feast; - } - /** Store configuration class for database that this Feast Serving uses. 
*/ public static class Store { @@ -327,30 +364,6 @@ public void setServer(Server server) { } } - private GrpcServer grpc; - private RestServer rest; - - public GrpcServer getGrpc() { - return grpc; - } - - public void setGrpc(GrpcServer grpc) { - this.grpc = grpc; - } - - public RestServer getRest() { - return rest; - } - - public void setRest(RestServer rest) { - this.rest = rest; - } - - public enum StoreType { - REDIS, - REDIS_CLUSTER; - } - /** Trace metric collection properties */ public static class TracingProperties { @@ -417,20 +430,4 @@ public void setServiceName(String serviceName) { this.serviceName = serviceName; } } - - /** - * Validates all FeastProperties. This method runs after properties have been initialized and - * individually and conditionally validates each class. - */ - @PostConstruct - public void validate() { - ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); - Validator validator = factory.getValidator(); - - // Validate root fields in FeastProperties - Set> violations = validator.validate(this); - if (!violations.isEmpty()) { - throw new ConstraintViolationException(violations); - } - } } diff --git a/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java b/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java index 4ea0692ccd..868e3b83d1 100644 --- a/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java +++ b/java/serving/src/main/java/feast/serving/config/ServingServiceConfigV2.java @@ -48,7 +48,8 @@ public ServingServiceV2 registryBasedServingServiceV2( new OnlineRetriever( applicationProperties.getFeast().getProject(), redisClusterClient, - new EntityKeySerializerV2()); + new EntityKeySerializerV2( + applicationProperties.getFeast().getEntityKeySerializationVersion())); break; case REDIS: RedisClientAdapter redisClient = RedisClient.create(store.getRedisConfig()); @@ -57,7 +58,8 @@ public ServingServiceV2 registryBasedServingServiceV2( new 
OnlineRetriever( applicationProperties.getFeast().getProject(), redisClient, - new EntityKeySerializerV2()); + new EntityKeySerializerV2( + applicationProperties.getFeast().getEntityKeySerializationVersion())); break; default: throw new RuntimeException( diff --git a/java/serving/src/main/java/feast/serving/registry/Registry.java b/java/serving/src/main/java/feast/serving/registry/Registry.java index bc953174ea..a7b28f7c66 100644 --- a/java/serving/src/main/java/feast/serving/registry/Registry.java +++ b/java/serving/src/main/java/feast/serving/registry/Registry.java @@ -33,6 +33,7 @@ public class Registry { private Map onDemandFeatureViewNameToSpec; private final Map featureServiceNameToSpec; + private final Map entityNameToJoinKey; Registry(RegistryProto.Registry registry) { this.registry = registry; @@ -60,6 +61,12 @@ public class Registry { .collect( Collectors.toMap( FeatureServiceProto.FeatureServiceSpec::getName, Function.identity())); + this.entityNameToJoinKey = + registry.getEntitiesList().stream() + .map(EntityProto.Entity::getSpec) + .collect( + Collectors.toMap( + EntityProto.EntitySpecV2::getName, EntityProto.EntitySpecV2::getJoinKey)); } public RegistryProto.Registry getRegistry() { @@ -115,4 +122,12 @@ public FeatureServiceProto.FeatureServiceSpec getFeatureServiceSpec(String name) } return spec; } + + public String getEntityJoinKey(String name) { + String joinKey = entityNameToJoinKey.get(name); + if (joinKey == null) { + throw new SpecRetrievalException(String.format("Unable to find entity with name: %s", name)); + } + return joinKey; + } } diff --git a/java/serving/src/main/java/feast/serving/registry/RegistryRepository.java b/java/serving/src/main/java/feast/serving/registry/RegistryRepository.java index 369493ee0f..023ec1a062 100644 --- a/java/serving/src/main/java/feast/serving/registry/RegistryRepository.java +++ b/java/serving/src/main/java/feast/serving/registry/RegistryRepository.java @@ -102,4 +102,8 @@ public Duration 
getMaxAge(ServingAPIProto.FeatureReferenceV2 featureReference) { public List getEntitiesList(ServingAPIProto.FeatureReferenceV2 featureReference) { return getFeatureViewSpec(featureReference).getEntitiesList(); } + + public String getEntityJoinKey(String name) { + return this.registry.getEntityJoinKey(name); + } } diff --git a/java/serving/src/main/java/feast/serving/service/OnlineServingServiceV2.java b/java/serving/src/main/java/feast/serving/service/OnlineServingServiceV2.java index 12e8a5b158..3751ee8119 100644 --- a/java/serving/src/main/java/feast/serving/service/OnlineServingServiceV2.java +++ b/java/serving/src/main/java/feast/serving/service/OnlineServingServiceV2.java @@ -34,7 +34,6 @@ import feast.serving.registry.RegistryRepository; import feast.serving.util.Metrics; import feast.storage.api.retriever.OnlineRetrieverV2; -import io.grpc.Status; import io.opentracing.Span; import io.opentracing.Tracer; import java.util.*; @@ -51,6 +50,11 @@ public class OnlineServingServiceV2 implements ServingServiceV2 { private final OnlineTransformationService onlineTransformationService; private final String project; + public static final String DUMMY_ENTITY_ID = "__dummy_id"; + public static final String DUMMY_ENTITY_VAL = ""; + public static final ValueProto.Value DUMMY_ENTITY_VALUE = + ValueProto.Value.newBuilder().setStringVal(DUMMY_ENTITY_VAL).build(); + public OnlineServingServiceV2( OnlineRetrieverV2 retriever, Tracer tracer, @@ -103,31 +107,18 @@ public ServingAPIProto.GetOnlineFeaturesResponse getOnlineFeatures( List> entityRows = getEntityRows(request); - List entityNames; - if (retrievedFeatureReferences.size() > 0) { - entityNames = this.registryRepository.getEntitiesList(retrievedFeatureReferences.get(0)); - } else { - throw new RuntimeException("Requested features list must not be empty"); - } - Span storageRetrievalSpan = tracer.buildSpan("storageRetrieval").start(); if (storageRetrievalSpan != null) { storageRetrievalSpan.setTag("entities", 
entityRows.size()); storageRetrievalSpan.setTag("features", retrievedFeatureReferences.size()); } + List> features = - retriever.getOnlineFeatures(entityRows, retrievedFeatureReferences, entityNames); + retrieveFeatures(retrievedFeatureReferences, entityRows); if (storageRetrievalSpan != null) { storageRetrievalSpan.finish(); } - if (features.size() != entityRows.size()) { - throw Status.INTERNAL - .withDescription( - "The no. of FeatureRow obtained from OnlineRetriever" - + "does not match no. of entityRow passed.") - .asRuntimeException(); - } Span postProcessingSpan = tracer.buildSpan("postProcessing").start(); @@ -255,6 +246,84 @@ private List> getEntityRows( return entityRows; } + private List> retrieveFeatures( + List featureReferences, List> entityRows) { + // Prepare feature reference to index mapping. This mapping will be used to arrange the + // retrieved features to the same order as in the input. + if (featureReferences.isEmpty()) { + throw new RuntimeException("Requested features list must not be empty."); + } + Map featureReferenceToIndexMap = + new HashMap<>(featureReferences.size()); + for (int i = 0; i < featureReferences.size(); i++) { + FeatureReferenceV2 featureReference = featureReferences.get(i); + if (featureReferenceToIndexMap.containsKey(featureReference)) { + throw new RuntimeException( + String.format( + "Found duplicate features %s:%s.", + featureReference.getFeatureViewName(), featureReference.getFeatureName())); + } + featureReferenceToIndexMap.put(featureReference, i); + } + + // Create placeholders for retrieved features. + List> features = new ArrayList<>(entityRows.size()); + for (int i = 0; i < entityRows.size(); i++) { + List featuresPerEntity = + new ArrayList<>(featureReferences.size()); + for (int j = 0; j < featureReferences.size(); j++) { + featuresPerEntity.add(null); + } + features.add(featuresPerEntity); + } + + // Group feature references by join keys. 
+ Map> groupNameToFeatureReferencesMap = + featureReferences.stream() + .collect( + Collectors.groupingBy( + featureReference -> + this.registryRepository.getEntitiesList(featureReference).stream() + .map(this.registryRepository::getEntityJoinKey) + .sorted() + .collect(Collectors.joining(",")))); + + // Retrieve features one group at a time. + for (List featureReferencesPerGroup : + groupNameToFeatureReferencesMap.values()) { + List entityNames = + this.registryRepository.getEntitiesList(featureReferencesPerGroup.get(0)); + List> entityRowsPerGroup = new ArrayList<>(entityRows.size()); + for (Map entityRow : entityRows) { + Map entityRowPerGroup = new HashMap<>(); + entityNames.stream() + .map(this.registryRepository::getEntityJoinKey) + .forEach( + joinKey -> { + if (joinKey.equals(DUMMY_ENTITY_ID)) { + entityRowPerGroup.put(joinKey, DUMMY_ENTITY_VALUE); + } else { + ValueProto.Value value = entityRow.get(joinKey); + if (value != null) { + entityRowPerGroup.put(joinKey, value); + } + } + }); + entityRowsPerGroup.add(entityRowPerGroup); + } + List> featuresPerGroup = + retriever.getOnlineFeatures(entityRowsPerGroup, featureReferencesPerGroup, entityNames); + for (int i = 0; i < featuresPerGroup.size(); i++) { + for (int j = 0; j < featureReferencesPerGroup.size(); j++) { + int k = featureReferenceToIndexMap.get(featureReferencesPerGroup.get(j)); + features.get(i).set(k, featuresPerGroup.get(i).get(j)); + } + } + } + + return features; + } + private void populateOnDemandFeatures( List onDemandFeatureReferences, List onDemandFeatureSources, diff --git a/java/serving/src/test/java/feast/serving/it/ServingBaseTests.java b/java/serving/src/test/java/feast/serving/it/ServingBaseTests.java index 30cba0cb06..66987e8c0d 100644 --- a/java/serving/src/test/java/feast/serving/it/ServingBaseTests.java +++ b/java/serving/src/test/java/feast/serving/it/ServingBaseTests.java @@ -172,5 +172,35 @@ public void shouldGetOnlineFeaturesWithStringEntity() { } } + @Test + public void 
shouldGetOnlineFeaturesFromAllFeatureViews() { + Map entityRows = + ImmutableMap.of( + "entity", + ValueProto.RepeatedValue.newBuilder() + .addVal(DataGenerator.createStrValue("key-1")) + .build(), + "driver_id", + ValueProto.RepeatedValue.newBuilder() + .addVal(DataGenerator.createInt64Value(1005)) + .build()); + + ImmutableList featureReferences = + ImmutableList.of( + "feature_view_0:feature_0", + "feature_view_0:feature_1", + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:avg_daily_trips"); + + ServingAPIProto.GetOnlineFeaturesRequest req = + TestUtils.createOnlineFeatureRequest(featureReferences, entityRows); + + ServingAPIProto.GetOnlineFeaturesResponse resp = servingStub.getOnlineFeatures(req); + + for (final int featureIdx : List.of(0, 1, 2, 3)) { + assertEquals(FieldStatus.PRESENT, resp.getResults(featureIdx).getStatuses(0)); + } + } + abstract void updateRegistryFile(RegistryProto.Registry registry); } diff --git a/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java b/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java index 64d2e20c9b..933e38f056 100644 --- a/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java +++ b/java/serving/src/test/java/feast/serving/service/OnlineServingServiceTest.java @@ -170,6 +170,8 @@ public void shouldReturnResponseWithValuesAndMetadataIfKeysPresent() { .thenReturn(featureSpecs.get(0)); when(registry.getFeatureSpec(mockedFeatureRows.get(3).getFeatureReference())) .thenReturn(featureSpecs.get(1)); + when(registry.getEntityJoinKey("entity1")).thenReturn("entity1"); + when(registry.getEntityJoinKey("entity2")).thenReturn("entity2"); when(tracer.buildSpan(ArgumentMatchers.any())).thenReturn(Mockito.mock(SpanBuilder.class)); @@ -237,6 +239,8 @@ public void shouldReturnResponseWithUnsetValuesAndMetadataIfKeysNotPresent() { .thenReturn(featureSpecs.get(0)); when(registry.getFeatureSpec(mockedFeatureRows.get(1).getFeatureReference())) 
.thenReturn(featureSpecs.get(1)); + when(registry.getEntityJoinKey("entity1")).thenReturn("entity1"); + when(registry.getEntityJoinKey("entity2")).thenReturn("entity2"); when(tracer.buildSpan(ArgumentMatchers.any())).thenReturn(Mockito.mock(SpanBuilder.class)); @@ -314,6 +318,8 @@ public void shouldReturnResponseWithValuesAndMetadataIfMaxAgeIsExceeded() { .thenReturn(featureSpecs.get(1)); when(registry.getFeatureSpec(mockedFeatureRows.get(5).getFeatureReference())) .thenReturn(featureSpecs.get(0)); + when(registry.getEntityJoinKey("entity1")).thenReturn("entity1"); + when(registry.getEntityJoinKey("entity2")).thenReturn("entity2"); when(tracer.buildSpan(ArgumentMatchers.any())).thenReturn(Mockito.mock(SpanBuilder.class)); diff --git a/java/serving/src/test/resources/docker-compose/feast10/Dockerfile b/java/serving/src/test/resources/docker-compose/feast10/Dockerfile index dc26c804a9..dee7dcf84c 100644 --- a/java/serving/src/test/resources/docker-compose/feast10/Dockerfile +++ b/java/serving/src/test/resources/docker-compose/feast10/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7 +FROM python:3.8 WORKDIR /usr/src/ diff --git a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java b/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java index 3e9ab7e8ab..f99e5cbdb1 100644 --- a/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java +++ b/java/storage/connectors/redis/src/main/java/feast/storage/connectors/redis/retriever/EntityKeySerializerV2.java @@ -30,6 +30,15 @@ // https://github.com/feast-dev/feast/blob/b1ccf8dd1535f721aee8bea937ee38feff80bec5/sdk/python/feast/infra/key_encoding_utils.py#L22 // and must be kept up to date with any changes in that logic. 
public class EntityKeySerializerV2 implements EntityKeySerializer { + private final int entityKeySerializationVersion; + + public EntityKeySerializerV2() { + this(1); + } + + public EntityKeySerializerV2(int entityKeySerializationVersion) { + this.entityKeySerializationVersion = entityKeySerializationVersion; + } @Override public byte[] serialize(RedisProto.RedisKeyV2 entityKey) { @@ -83,7 +92,11 @@ public byte[] serialize(RedisProto.RedisKeyV2 entityKey) { we use `struct.pack(" encodeInteger(Integer value) { return Arrays.asList(ArrayUtils.toObject(buffer.array())); } + private List encodeLong(Long value) { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); + buffer.order(ByteOrder.LITTLE_ENDIAN); + buffer.putLong(value); + + return Arrays.asList(ArrayUtils.toObject(buffer.array())); + } + private List encodeString(String value) { byte[] stringBytes = value.getBytes(StandardCharsets.UTF_8); return encodeBytes(stringBytes); diff --git a/protos/feast/core/Registry.proto b/protos/feast/core/Registry.proto index 19f17a8158..7d80d8c837 100644 --- a/protos/feast/core/Registry.proto +++ b/protos/feast/core/Registry.proto @@ -34,7 +34,7 @@ import "feast/core/SavedDataset.proto"; import "feast/core/ValidationProfile.proto"; import "google/protobuf/timestamp.proto"; -// Next id: 15 +// Next id: 16 message Registry { repeated Entity entities = 1; repeated FeatureTable feature_tables = 2; @@ -47,9 +47,15 @@ message Registry { repeated SavedDataset saved_datasets = 11; repeated ValidationReference validation_references = 13; Infra infra = 10; + // Tracking metadata of Feast by project + repeated ProjectMetadata project_metadata = 15; string registry_schema_version = 3; // to support migrations; incremented when schema is changed string version_id = 4; // version id, random string generated on each update of the data; now used only for debugging purposes google.protobuf.Timestamp last_updated = 5; +} +message ProjectMetadata { + string project = 1; + string project_uuid = 
2; } diff --git a/protos/feast/core/SavedDataset.proto b/protos/feast/core/SavedDataset.proto index 353e925ad1..53f06f73a9 100644 --- a/protos/feast/core/SavedDataset.proto +++ b/protos/feast/core/SavedDataset.proto @@ -58,6 +58,7 @@ message SavedDatasetStorage { DataSource.SnowflakeOptions snowflake_storage = 7; DataSource.TrinoOptions trino_storage = 8; DataSource.SparkOptions spark_storage = 9; + DataSource.CustomSourceOptions custom_storage = 10; } } diff --git a/protos/feast/core/StreamFeatureView.proto b/protos/feast/core/StreamFeatureView.proto index d217b86a3f..06e9ee0612 100644 --- a/protos/feast/core/StreamFeatureView.proto +++ b/protos/feast/core/StreamFeatureView.proto @@ -34,7 +34,7 @@ import "feast/core/Aggregation.proto"; message StreamFeatureView { // User-specified specifications of this feature view. StreamFeatureViewSpec spec = 1; - StreamFeatureViewMeta meta = 2; + FeatureViewMeta meta = 2; } // Next available id: 17 @@ -90,13 +90,3 @@ message StreamFeatureViewSpec { string timestamp_field = 16; } -message StreamFeatureViewMeta { - // Time where this Feature View is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time where this Feature View is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; - - // List of pairs (start_time, end_time) for which this feature view has been materialized. 
- repeated MaterializationInterval materialization_intervals = 3; -} diff --git a/pyproject.toml b/pyproject.toml index 64394a558a..c89f1d9cc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=60", "wheel", "setuptools_scm>=6.2", "grpcio", "grpcio-tools==1.44.0", "mypy-protobuf==3.1", "sphinx!=4.0.0"] +requires = ["setuptools>=60", "wheel", "setuptools_scm>=6.2", "grpcio", "grpcio-tools>=1.47.0", "mypy-protobuf==3.1", "sphinx!=4.0.0"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] @@ -25,6 +25,7 @@ exclude = ''' | pb2.py | \.pyi | protos + | sdk/python/feast/embedded_go/lib )/ ) ''' diff --git a/sdk/python/docs/index.rst b/sdk/python/docs/index.rst index 9297901c33..07b9d9a77e 100644 --- a/sdk/python/docs/index.rst +++ b/sdk/python/docs/index.rst @@ -250,18 +250,21 @@ Sqlite Online Store .. automodule:: feast.infra.online_stores.sqlite :members: + :noindex: Datastore Online Store ---------------------- .. automodule:: feast.infra.online_stores.datastore :members: + :noindex: DynamoDB Online Store --------------------- .. automodule:: feast.infra.online_stores.dynamodb :members: + :noindex: Redis Online Store ------------------ @@ -283,3 +286,23 @@ HBase Online Store .. automodule:: feast.infra.online_stores.contrib.hbase_online_store.hbase :members: :noindex: + + +Batch Materialization Engine +============================ + +.. automodule:: feast.infra.materialization + :members: BatchMaterializationEngine, MaterializationJob, MaterializationTask + +Local Engine +------------ +.. autoclass:: feast.infra.materialization.LocalMaterializationEngine + :members: + :noindex: + +(Alpha) Lambda Based Engine +--------------------------- + +.. 
autoclass:: feast.infra.materialization.lambda.lambda_engine + :members: + :noindex: diff --git a/sdk/python/docs/source/feast.infra.materialization.lambda.rst b/sdk/python/docs/source/feast.infra.materialization.lambda.rst new file mode 100644 index 0000000000..7ca1d44314 --- /dev/null +++ b/sdk/python/docs/source/feast.infra.materialization.lambda.rst @@ -0,0 +1,29 @@ +feast.infra.materialization.lambda package +========================================== + +Submodules +---------- + +feast.infra.materialization.lambda.app module +--------------------------------------------- + +.. automodule:: feast.infra.materialization.lambda.app + :members: + :undoc-members: + :show-inheritance: + +feast.infra.materialization.lambda.lambda\_engine module +-------------------------------------------------------- + +.. automodule:: feast.infra.materialization.lambda.lambda_engine + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: feast.infra.materialization.lambda + :members: + :undoc-members: + :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.materialization.rst b/sdk/python/docs/source/feast.infra.materialization.rst new file mode 100644 index 0000000000..ff3e1cf135 --- /dev/null +++ b/sdk/python/docs/source/feast.infra.materialization.rst @@ -0,0 +1,37 @@ +feast.infra.materialization package +=================================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + feast.infra.materialization.lambda + +Submodules +---------- + +feast.infra.materialization.batch\_materialization\_engine module +----------------------------------------------------------------- + +.. automodule:: feast.infra.materialization.batch_materialization_engine + :members: + :undoc-members: + :show-inheritance: + +feast.infra.materialization.local\_engine module +------------------------------------------------ + +.. 
automodule:: feast.infra.materialization.local_engine + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: feast.infra.materialization + :members: + :undoc-members: + :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.online_stores.rst b/sdk/python/docs/source/feast.infra.online_stores.rst index 842522c9d7..65758c409c 100644 --- a/sdk/python/docs/source/feast.infra.online_stores.rst +++ b/sdk/python/docs/source/feast.infra.online_stores.rst @@ -52,6 +52,14 @@ feast.infra.online\_stores.redis module :undoc-members: :show-inheritance: +feast.infra.online\_stores.snowflake module +------------------------------------------- + +.. automodule:: feast.infra.online_stores.snowflake + :members: + :undoc-members: + :show-inheritance: + feast.infra.online\_stores.sqlite module ---------------------------------------- diff --git a/sdk/python/docs/source/feast.infra.rst b/sdk/python/docs/source/feast.infra.rst index ec2cc120a6..42c7d1334b 100644 --- a/sdk/python/docs/source/feast.infra.rst +++ b/sdk/python/docs/source/feast.infra.rst @@ -7,6 +7,7 @@ Subpackages .. toctree:: :maxdepth: 4 + feast.infra.materialization feast.infra.offline_stores feast.infra.online_stores feast.infra.registry_stores diff --git a/sdk/python/docs/source/feast.rst b/sdk/python/docs/source/feast.rst index 7c569fc713..c000ac2e2b 100644 --- a/sdk/python/docs/source/feast.rst +++ b/sdk/python/docs/source/feast.rst @@ -225,6 +225,14 @@ feast.online\_response module :undoc-members: :show-inheritance: +feast.project\_metadata module +------------------------------ + +.. automodule:: feast.project_metadata + :members: + :undoc-members: + :show-inheritance: + feast.proto\_json module ------------------------ @@ -273,6 +281,14 @@ feast.repo\_operations module :undoc-members: :show-inheritance: +feast.repo\_upgrade module +-------------------------- + +.. 
automodule:: feast.repo_upgrade + :members: + :undoc-members: + :show-inheritance: + feast.request\_feature\_view module ----------------------------------- diff --git a/sdk/python/docs/source/index.rst b/sdk/python/docs/source/index.rst index 9297901c33..07b9d9a77e 100644 --- a/sdk/python/docs/source/index.rst +++ b/sdk/python/docs/source/index.rst @@ -250,18 +250,21 @@ Sqlite Online Store .. automodule:: feast.infra.online_stores.sqlite :members: + :noindex: Datastore Online Store ---------------------- .. automodule:: feast.infra.online_stores.datastore :members: + :noindex: DynamoDB Online Store --------------------- .. automodule:: feast.infra.online_stores.dynamodb :members: + :noindex: Redis Online Store ------------------ @@ -283,3 +286,23 @@ HBase Online Store .. automodule:: feast.infra.online_stores.contrib.hbase_online_store.hbase :members: :noindex: + + +Batch Materialization Engine +============================ + +.. automodule:: feast.infra.materialization + :members: BatchMaterializationEngine, MaterializationJob, MaterializationTask + +Local Engine +------------ +.. autoclass:: feast.infra.materialization.LocalMaterializationEngine + :members: + :noindex: + +(Alpha) Lambda Based Engine +--------------------------- + +.. autoclass:: feast.infra.materialization.lambda.lambda_engine + :members: + :noindex: diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index 91815d30fd..153c1a5ddd 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -118,21 +118,24 @@ def version(): "-h", type=click.STRING, default="0.0.0.0", - help="Specify a host for the server [default: 0.0.0.0]", + show_default=True, + help="Specify a host for the server", ) @click.option( "--port", "-p", type=click.INT, default=8888, - help="Specify a port for the server [default: 8888]", + show_default=True, + help="Specify a port for the server", ) @click.option( "--registry_ttl_sec", "-r", - help="Number of seconds after which the registry is refreshed. 
Default is 5 seconds.", + help="Number of seconds after which the registry is refreshed", type=int, default=5, + show_default=True, ) @click.pass_context def ui(ctx: click.Context, host: str, port: int, registry_ttl_sec: int): @@ -524,7 +527,10 @@ def registry_dump_command(ctx: click.Context): @click.argument("start_ts") @click.argument("end_ts") @click.option( - "--views", "-v", help="Feature views to materialize", multiple=True, + "--views", + "-v", + help="Feature views to materialize", + multiple=True, ) @click.pass_context def materialize_command( @@ -551,7 +557,10 @@ def materialize_command( @cli.command("materialize-incremental") @click.argument("end_ts") @click.option( - "--views", "-v", help="Feature views to incrementally materialize", multiple=True, + "--views", + "-v", + help="Feature views to incrementally materialize", + multiple=True, ) @click.pass_context def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List[str]): @@ -604,14 +613,16 @@ def init_command(project_directory, minimal: bool, template: str): "-h", type=click.STRING, default="127.0.0.1", - help="Specify a host for the server [default: 127.0.0.1]", + show_default=True, + help="Specify a host for the server", ) @click.option( "--port", "-p", type=click.INT, default=6566, - help="Specify a port for the server [default: 6566]", + show_default=True, + help="Specify a port for the server", ) @click.option( "--type", @@ -619,13 +630,26 @@ def init_command(project_directory, minimal: bool, template: str): "type_", type=click.STRING, default="http", - help="Specify a server type: 'http' or 'grpc' [default: http]", + show_default=True, + help="Specify a server type: 'http' or 'grpc'", +) +@click.option( + "--go", + is_flag=True, + show_default=True, + help="Use Go to serve", ) @click.option( - "--no-access-log", is_flag=True, help="Disable the Uvicorn access log.", + "--no-access-log", + is_flag=True, + show_default=True, + help="Disable the Uvicorn access log", ) 
@click.option( - "--no-feature-log", is_flag=True, help="Disable logging served features", + "--no-feature-log", + is_flag=True, + show_default=True, + help="Disable logging served features", ) @click.pass_context def serve_command( @@ -633,6 +657,7 @@ def serve_command( host: str, port: int, type_: str, + go: bool, no_access_log: bool, no_feature_log: bool, ): @@ -641,6 +666,10 @@ def serve_command( cli_check_repo(repo) store = FeatureStore(repo_path=str(repo)) + if go: + # Turn on Go feature retrieval. + store.config.go_feature_serving = True + store.serve(host, port, type_, no_access_log, no_feature_log) @@ -771,13 +800,19 @@ def disable_alpha_features(ctx: click.Context): @cli.command("validate") @click.option( - "--feature-service", "-f", help="Specify a feature service name", + "--feature-service", + "-f", + help="Specify a feature service name", ) @click.option( - "--reference", "-r", help="Specify a validation reference name", + "--reference", + "-r", + help="Specify a validation reference name", ) @click.option( - "--no-profile-cache", is_flag=True, help="Do not store cached profile in registry", + "--no-profile-cache", + is_flag=True, + help="Do not store cached profile in registry", ) @click.argument("start_ts") @click.argument("end_ts") diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index f5c40d2421..a1e44b3186 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -116,7 +116,10 @@ class KinesisOptions: """ def __init__( - self, record_format: StreamFormat, region: str, stream_name: str, + self, + record_format: StreamFormat, + region: str, + stream_name: str, ): self.record_format = record_format self.region = region @@ -270,6 +273,13 @@ def __init__( ), DeprecationWarning, ) + if ( + self.timestamp_field + and self.timestamp_field == self.created_timestamp_column + ): + raise ValueError( + "Please do not use the same column for 'timestamp_field' and 'created_timestamp_column'." 
+ ) self.description = description or "" self.tags = tags or {} self.owner = owner or "" diff --git a/sdk/python/feast/diff/infra_diff.py b/sdk/python/feast/diff/infra_diff.py index a09eaf39eb..51bece33dd 100644 --- a/sdk/python/feast/diff/infra_diff.py +++ b/sdk/python/feast/diff/infra_diff.py @@ -126,7 +126,8 @@ def diff_infra_protos( infra_objects_to_delete, infra_objects_to_add, ) = tag_infra_proto_objects_for_keep_delete_add( - current_infra_objects, new_infra_objects, + current_infra_objects, + new_infra_objects, ) for e in infra_objects_to_add: @@ -199,5 +200,10 @@ def diff_between( ) ) return InfraObjectDiff( - new.name, infra_object_type, current, new, property_diffs, transition, + new.name, + infra_object_type, + current, + new, + property_diffs, + transition, ) diff --git a/sdk/python/feast/diff/registry_diff.py b/sdk/python/feast/diff/registry_diff.py index 7a5b9b7564..fc0acf0223 100644 --- a/sdk/python/feast/diff/registry_diff.py +++ b/sdk/python/feast/diff/registry_diff.py @@ -161,7 +161,9 @@ def diff_registry_objects( def extract_objects_for_keep_delete_update_add( - registry: BaseRegistry, current_project: str, desired_repo_contents: RepoContents, + registry: BaseRegistry, + current_project: str, + desired_repo_contents: RepoContents, ) -> Tuple[ Dict[FeastObjectType, Set[FeastObject]], Dict[FeastObjectType, Set[FeastObject]], @@ -208,7 +210,9 @@ def extract_objects_for_keep_delete_update_add( def diff_between( - registry: BaseRegistry, current_project: str, desired_repo_contents: RepoContents, + registry: BaseRegistry, + current_project: str, + desired_repo_contents: RepoContents, ) -> RegistryDiff: """ Returns the difference between the current and desired repo states. 
@@ -305,12 +309,16 @@ def apply_diff_to_registry( BaseFeatureView, feast_object_diff.current_feast_object ) registry.delete_feature_view( - feature_view_obj.name, project, commit=False, + feature_view_obj.name, + project, + commit=False, ) elif feast_object_diff.feast_object_type == FeastObjectType.DATA_SOURCE: ds_obj = cast(DataSource, feast_object_diff.current_feast_object) registry.delete_data_source( - ds_obj.name, project, commit=False, + ds_obj.name, + project, + commit=False, ) if feast_object_diff.transition_type in [ diff --git a/sdk/python/feast/driver_test_data.py b/sdk/python/feast/driver_test_data.py index 117bfcbd9c..da9d061313 100644 --- a/sdk/python/feast/driver_test_data.py +++ b/sdk/python/feast/driver_test_data.py @@ -30,7 +30,12 @@ def _convert_event_timestamp(event_timestamp: pd.Timestamp, t: EventTimestampTyp def create_orders_df( - customers, drivers, start_date, end_date, order_count, locations=None, + customers, + drivers, + start_date, + end_date, + order_count, + locations=None, ) -> pd.DataFrame: """ Example df generated by this function (if locations): @@ -98,7 +103,7 @@ def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame "event_timestamp": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range( - start=start_date, end=end_date, freq="1H", closed="left" + start=start_date, end=end_date, freq="1H", inclusive="left" ) ] # include a fixed timestamp for get_historical_features in the quickstart diff --git a/sdk/python/feast/embedded_go/online_features_service.py b/sdk/python/feast/embedded_go/online_features_service.py index 3081843778..bf82fab6a3 100644 --- a/sdk/python/feast/embedded_go/online_features_service.py +++ b/sdk/python/feast/embedded_go/online_features_service.py @@ -50,7 +50,8 @@ def __init__( ) self._service = NewOnlineFeatureService( - self._config, self._transformation_callback, + self._config, + self._transformation_callback, ) # This should raise an exception if there were 
any errors in NewOnlineFeatureService. @@ -147,6 +148,7 @@ def get_online_features( features_ptr_array, features_ptr_schema ) resp = record_batch_to_online_response(record_batch) + del record_batch return OnlineResponse(resp) def start_grpc_server( @@ -262,7 +264,9 @@ def transformation_callback( def logging_callback( - fs: "FeatureStore", feature_service_name: str, dataset_dir: str, + fs: "FeatureStore", + feature_service_name: str, + dataset_dir: str, ) -> bytes: feature_service = fs.get_feature_service(feature_service_name, allow_cache=True) try: diff --git a/sdk/python/feast/feature.py b/sdk/python/feast/feature.py index d1f96c302a..6b5acd9fc6 100644 --- a/sdk/python/feast/feature.py +++ b/sdk/python/feast/feature.py @@ -30,7 +30,10 @@ class Feature: """ def __init__( - self, name: str, dtype: ValueType, labels: Optional[Dict[str, str]] = None, + self, + name: str, + dtype: ValueType, + labels: Optional[Dict[str, str]] = None, ): """Creates a Feature object.""" self._name = name @@ -91,7 +94,9 @@ def to_proto(self) -> FeatureSpecProto: value_type = ValueTypeProto.Enum.Value(self.dtype.name) return FeatureSpecProto( - name=self.name, value_type=value_type, tags=self.labels, + name=self.name, + value_type=value_type, + tags=self.labels, ) @classmethod diff --git a/sdk/python/feast/feature_logging.py b/sdk/python/feast/feature_logging.py index 275bde72ec..da9a0c9fe5 100644 --- a/sdk/python/feast/feature_logging.py +++ b/sdk/python/feast/feature_logging.py @@ -34,12 +34,12 @@ class LoggingSource: @abc.abstractmethod def get_schema(self, registry: "BaseRegistry") -> pa.Schema: - """ Generate schema for logs destination. """ + """Generate schema for logs destination.""" raise NotImplementedError @abc.abstractmethod def get_log_timestamp_column(self) -> str: - """ Return timestamp column that must exist in generated schema. 
""" + """Return timestamp column that must exist in generated schema.""" raise NotImplementedError diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index de52b9e3f3..c4ccc9a648 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -118,7 +118,9 @@ class FeatureStore: @log_exceptions def __init__( - self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None, + self, + repo_path: Optional[str] = None, + config: Optional[RepoConfig] = None, ): """ Creates a FeatureStore object. @@ -142,7 +144,7 @@ def __init__( self._registry = SqlRegistry(registry_config, None) else: r = Registry(registry_config, repo_path=self.repo_path) - r._initialize_registry() + r._initialize_registry(self.config.project) self._registry = r self._provider = get_provider(self.config, self.repo_path) self._go_server = None @@ -183,7 +185,7 @@ def refresh_registry(self): """ registry_config = self.config.get_registry_config() registry = Registry(registry_config, repo_path=self.repo_path) - registry.refresh() + registry.refresh(self.config.project) self._registry = registry @@ -253,7 +255,9 @@ def list_request_feature_views( ) def _list_feature_views( - self, allow_cache: bool = False, hide_dummy_entity: bool = True, + self, + allow_cache: bool = False, + hide_dummy_entity: bool = True, ) -> List[FeatureView]: feature_views = [] for fv in self._registry.list_feature_views( @@ -266,7 +270,9 @@ def _list_feature_views( return feature_views def _list_stream_feature_views( - self, allow_cache: bool = False, hide_dummy_entity: bool = True, + self, + allow_cache: bool = False, + hide_dummy_entity: bool = True, ) -> List[StreamFeatureView]: stream_feature_views = [] for sfv in self._registry.list_stream_feature_views( @@ -480,7 +486,9 @@ def delete_feature_service(self, name: str): return self._registry.delete_feature_service(name, self.project) def _get_features( - self, features: Union[List[str], FeatureService], 
allow_cache: bool = False, + self, + features: Union[List[str], FeatureService], + allow_cache: bool = False, ) -> List[str]: _features = features @@ -511,8 +519,8 @@ def _get_features( return _feature_refs def _should_use_plan(self): - """Returns True if _plan and _apply_diffs should be used, False otherwise.""" - # Currently only the local provider with sqlite online store supports _plan and _apply_diffs. + """Returns True if plan and _apply_diffs should be used, False otherwise.""" + # Currently only the local provider with sqlite online store supports plan and _apply_diffs. return self.config.provider == "local" and ( self.config.online_store and self.config.online_store.type == "sqlite" ) @@ -589,7 +597,8 @@ def _make_inferences( feature_service.infer_features(fvs_to_update=fvs_to_update_map) def _get_feature_views_to_materialize( - self, feature_views: Optional[List[str]], + self, + feature_views: Optional[List[str]], ) -> List[FeatureView]: """ Returns the list of feature views that should be materialized. @@ -636,7 +645,7 @@ def _get_feature_views_to_materialize( return feature_views_to_materialize @log_exceptions_and_usage - def _plan( + def plan( self, desired_repo_contents: RepoContents ) -> Tuple[RegistryDiff, InfraDiff, Infra]: """Dry-run registering objects to metadata store. @@ -670,7 +679,7 @@ def _plan( ... ttl=timedelta(seconds=86400 * 1), ... batch_source=driver_hourly_stats, ... ) - >>> registry_diff, infra_diff, new_infra = fs._plan(RepoContents( + >>> registry_diff, infra_diff, new_infra = fs.plan(RepoContents( ... data_sources=[driver_hourly_stats], ... feature_views=[driver_hourly_stats_view], ... on_demand_feature_views=list(), @@ -704,7 +713,7 @@ def _plan( # Compute the desired difference between the current infra, as stored in the registry, # and the desired infra. 
- self._registry.refresh() + self._registry.refresh(self.project) current_infra_proto = self._registry.proto().infra.__deepcopy__() desired_registry_proto = desired_repo_contents.to_registry_proto() new_infra = self._provider.plan_infra(self.config, desired_registry_proto) @@ -1057,19 +1066,20 @@ def get_historical_features( # Check that the right request data is present in the entity_df if type(entity_df) == pd.DataFrame: - entity_pd_df = cast(pd.DataFrame, entity_df) + entity_df = utils.make_df_tzaware(cast(pd.DataFrame, entity_df)) for fv in request_feature_views: for feature in fv.features: - if feature.name not in entity_pd_df.columns: + if feature.name not in entity_df.columns: raise RequestDataNotFoundInEntityDfException( feature_name=feature.name, feature_view_name=fv.name ) for odfv in on_demand_feature_views: odfv_request_data_schema = odfv.get_request_data_schema() for feature_name in odfv_request_data_schema.keys(): - if feature_name not in entity_pd_df.columns: + if feature_name not in entity_df.columns: raise RequestDataNotFoundInEntityDfException( - feature_name=feature_name, feature_view_name=odfv.name, + feature_name=feature_name, + feature_view_name=odfv.name, ) _validate_feature_refs(_feature_refs, full_feature_names) @@ -1121,8 +1131,7 @@ def create_saved_dataset( if not from_.metadata: raise ValueError( - "RetrievalJob must contains metadata. " - "Use RetrievalJob produced by get_historical_features" + f"The RetrievalJob {type(from_)} must implement the metadata property." ) dataset = SavedDataset( @@ -1182,7 +1191,9 @@ def get_saved_dataset(self, name: str) -> SavedDataset: @log_exceptions_and_usage def materialize_incremental( - self, end_date: datetime, feature_views: Optional[List[str]] = None, + self, + end_date: datetime, + feature_views: Optional[List[str]] = None, ) -> None: """ Materialize incremental new data from the offline store into the online store. 
@@ -1264,7 +1275,10 @@ def tqdm_builder(length): ) self._registry.apply_materialization( - feature_view, self.project, start_date, end_date, + feature_view, + self.project, + start_date, + end_date, ) @log_exceptions_and_usage @@ -1336,7 +1350,10 @@ def tqdm_builder(length): ) self._registry.apply_materialization( - feature_view, self.project, start_date, end_date, + feature_view, + self.project, + start_date, + end_date, ) @log_exceptions_and_usage @@ -1383,7 +1400,7 @@ def push( fv.name, df, allow_registry_cache=allow_registry_cache ) if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE: - self._write_to_offline_store( + self.write_to_offline_store( fv.name, df, allow_registry_cache=allow_registry_cache ) @@ -1415,14 +1432,18 @@ def write_to_online_store( provider.ingest_df(feature_view, entities, df) @log_exceptions_and_usage - def _write_to_offline_store( + def write_to_offline_store( self, feature_view_name: str, df: pd.DataFrame, allow_registry_cache: bool = True, + reorder_columns: bool = True, ): """ - ingests data directly into the Online store + Persists the dataframe directly into the batch data source for the given feature view. + + Fails if the dataframe columns do not match the columns of the batch data source. Optionally + reorders the columns of the dataframe to match. """ # TODO: restrict this to work with online StreamFeatureViews and validate the FeatureView type try: @@ -1433,7 +1454,21 @@ def _write_to_offline_store( feature_view = self.get_feature_view( feature_view_name, allow_registry_cache=allow_registry_cache ) - df.reset_index(drop=True) + + # Get columns of the batch source and the input dataframe. 
+ column_names_and_types = ( + feature_view.batch_source.get_table_column_names_and_types(self.config) + ) + source_columns = [column for column, _ in column_names_and_types] + input_columns = df.columns.values.tolist() + + if set(input_columns) != set(source_columns): + raise ValueError( + f"The input dataframe has columns {set(input_columns)} but the batch source has columns {set(source_columns)}." + ) + + if reorder_columns: + df = df.reindex(columns=source_columns) table = pa.Table.from_pandas(df) provider = self._get_provider() @@ -1529,7 +1564,7 @@ def _get_online_features( for k, v in entity_values.items() } - # If Go feature server is enabled, send request to it instead of going through regular Python logic + # If the embedded Go code is enabled, send request to it instead of going through regular Python logic. if self.config.go_feature_retrieval: self._lazy_init_go_server() @@ -1683,12 +1718,17 @@ def _get_online_features( for table, requested_features in grouped_refs: # Get the correct set of entity values with the correct join keys. table_entity_values, idxs = self._get_unique_entities( - table, join_key_values, entity_name_to_join_key_map, + table, + join_key_values, + entity_name_to_join_key_map, ) # Fetch feature data for the minimum set of Entities. feature_data = self._read_from_online_store( - table_entity_values, provider, requested_features, table, + table_entity_values, + provider, + requested_features, + table, ) # Populate the result_rows with the Features from the OnlineStore inplace. @@ -1857,7 +1897,9 @@ def _get_unique_entities( """ # Get the correct set of entity values with the correct join keys. table_entity_values = self._get_table_entity_values( - table, entity_name_to_join_key_map, join_key_values, + table, + entity_name_to_join_key_map, + join_key_values, ) # Convert back to rowise. 
@@ -2042,7 +2084,8 @@ def _augment_response_with_on_demand_transforms( for odfv_name, _feature_refs in odfv_feature_refs.items(): odfv = requested_odfv_map[odfv_name] transformed_features_df = odfv.get_transformed_features_df( - initial_response_df, full_feature_names, + initial_response_df, + full_feature_names, ) selected_subset = [ f for f in transformed_features_df.columns if f in _feature_refs @@ -2099,9 +2142,7 @@ def _get_feature_views_to_use( features: Optional[Union[List[str], FeatureService]], allow_cache=False, hide_dummy_entity: bool = True, - ) -> Tuple[ - List[FeatureView], List[RequestFeatureView], List[OnDemandFeatureView], - ]: + ) -> Tuple[List[FeatureView], List[RequestFeatureView], List[OnDemandFeatureView]]: fvs = { fv.name: fv @@ -2176,7 +2217,7 @@ def serve( ) -> None: """Start the feature consumption server locally on a given port.""" type_ = type_.lower() - if self.config.go_feature_retrieval: + if self.config.go_feature_serving: # Start go server instead of python if the flag is enabled self._lazy_init_go_server() enable_logging = ( @@ -2255,7 +2296,7 @@ def _teardown_go_server(self): @log_exceptions_and_usage def write_logged_features( - self, logs: Union[pa.Table, Path], source: Union[FeatureService] + self, logs: Union[pa.Table, Path], source: FeatureService ): """ Write logs produced by a source (currently only feature service is supported as a source) @@ -2284,7 +2325,7 @@ def write_logged_features( @log_exceptions_and_usage def validate_logged_features( self, - source: Union[FeatureService], + source: FeatureService, start: datetime, end: datetime, reference: ValidationReference, @@ -2346,10 +2387,10 @@ def get_validation_reference( self, name: str, allow_cache: bool = False ) -> ValidationReference: """ - Retrieves a validation reference. + Retrieves a validation reference. - Raises: - ValidationReferenceNotFoundException: The validation reference could not be found. 
+ Raises: + ValidationReferenceNotFoundException: The validation reference could not be found. """ ref = self._registry.get_validation_reference( name, project=self.project, allow_cache=allow_cache diff --git a/sdk/python/feast/feature_view.py b/sdk/python/feast/feature_view.py index 348c3019c5..0310376646 100644 --- a/sdk/python/feast/feature_view.py +++ b/sdk/python/feast/feature_view.py @@ -44,7 +44,10 @@ DUMMY_ENTITY_ID = "__dummy_id" DUMMY_ENTITY_NAME = "__dummy" DUMMY_ENTITY_VAL = "" -DUMMY_ENTITY = Entity(name=DUMMY_ENTITY_NAME, join_keys=[DUMMY_ENTITY_ID],) +DUMMY_ENTITY = Entity( + name=DUMMY_ENTITY_NAME, + join_keys=[DUMMY_ENTITY_ID], +) @typechecked @@ -407,21 +410,8 @@ def to_proto(self) -> FeatureViewProto: Returns: A FeatureViewProto protobuf. """ - meta = FeatureViewMetaProto(materialization_intervals=[]) - if self.created_timestamp: - meta.created_timestamp.FromDatetime(self.created_timestamp) - if self.last_updated_timestamp: - meta.last_updated_timestamp.FromDatetime(self.last_updated_timestamp) - for interval in self.materialization_intervals: - interval_proto = MaterializationIntervalProto() - interval_proto.start_time.FromDatetime(interval[0]) - interval_proto.end_time.FromDatetime(interval[1]) - meta.materialization_intervals.append(interval_proto) - - ttl_duration = None - if self.ttl is not None: - ttl_duration = Duration() - ttl_duration.FromTimedelta(self.ttl) + meta = self.to_proto_meta() + ttl_duration = self.get_ttl_duration() batch_source_proto = self.batch_source.to_proto() batch_source_proto.data_source_class_type = f"{self.batch_source.__class__.__module__}.{self.batch_source.__class__.__name__}" @@ -447,6 +437,26 @@ def to_proto(self) -> FeatureViewProto: return FeatureViewProto(spec=spec, meta=meta) + def to_proto_meta(self): + meta = FeatureViewMetaProto(materialization_intervals=[]) + if self.created_timestamp: + meta.created_timestamp.FromDatetime(self.created_timestamp) + if self.last_updated_timestamp: + 
meta.last_updated_timestamp.FromDatetime(self.last_updated_timestamp) + for interval in self.materialization_intervals: + interval_proto = MaterializationIntervalProto() + interval_proto.start_time.FromDatetime(interval[0]) + interval_proto.end_time.FromDatetime(interval[1]) + meta.materialization_intervals.append(interval_proto) + return meta + + def get_ttl_duration(self): + ttl_duration = None + if self.ttl is not None: + ttl_duration = Duration() + ttl_duration.FromTimedelta(self.ttl) + return ttl_duration + @classmethod def from_proto(cls, feature_view_proto: FeatureViewProto): """ diff --git a/sdk/python/feast/field.py b/sdk/python/feast/field.py index d0b4274cd2..a3dc3732da 100644 --- a/sdk/python/feast/field.py +++ b/sdk/python/feast/field.py @@ -38,7 +38,11 @@ class Field: tags: Dict[str, str] def __init__( - self, *, name: str, dtype: FeastType, tags: Optional[Dict[str, str]] = None, + self, + *, + name: str, + dtype: FeastType, + tags: Optional[Dict[str, str]] = None, ): """ Creates a Field object. 
diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 011a3b99b2..0b8e42b4e9 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -144,7 +144,8 @@ def update_feature_views_with_inferred_features_and_entities( ): fv.entity_columns.append( Field( - name=entity.join_key, dtype=from_value_type(entity.value_type), + name=entity.join_key, + dtype=from_value_type(entity.value_type), ) ) @@ -166,7 +167,10 @@ def update_feature_views_with_inferred_features_and_entities( if run_inference_for_entities or run_inference_for_features: _infer_features_and_entities( - fv, join_keys, run_inference_for_features, config, + fv, + join_keys, + run_inference_for_features, + config, ) if not fv.features: @@ -177,7 +181,10 @@ def update_feature_views_with_inferred_features_and_entities( def _infer_features_and_entities( - fv: FeatureView, join_keys: Set[str], run_inference_for_features, config, + fv: FeatureView, + join_keys: Set[str], + run_inference_for_features, + config, ) -> None: """ Updates the specific feature in place with inferred features and entities. 
diff --git a/sdk/python/feast/infra/aws.py b/sdk/python/feast/infra/aws.py index 14301faf19..145c55952e 100644 --- a/sdk/python/feast/infra/aws.py +++ b/sdk/python/feast/infra/aws.py @@ -106,6 +106,15 @@ def update_infra( self._deploy_feature_server(project, image_uri) + if self.batch_engine: + self.batch_engine.update( + project, + tables_to_delete, + tables_to_keep, + entities_to_delete, + entities_to_keep, + ) + def _deploy_feature_server(self, project: str, image_uri: str): _logger.info("Deploying feature server...") @@ -196,10 +205,12 @@ def _deploy_feature_server(self, project: str, image_uri: str): @log_exceptions_and_usage(provider="AwsProvider") def teardown_infra( - self, project: str, tables: Sequence[FeatureView], entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ) -> None: - if self.online_store: - self.online_store.teardown(self.repo_config, tables, entities) + super(AwsProvider, self).teardown_infra(project, tables, entities) if ( self.repo_config.feature_server is not None diff --git a/sdk/python/feast/infra/contrib/spark_kafka_processor.py b/sdk/python/feast/infra/contrib/spark_kafka_processor.py index 4dfb615773..32d91b2010 100644 --- a/sdk/python/feast/infra/contrib/spark_kafka_processor.py +++ b/sdk/python/feast/infra/contrib/spark_kafka_processor.py @@ -1,12 +1,14 @@ from types import MethodType -from typing import List +from typing import List, Optional +import pandas as pd from pyspark.sql import DataFrame, SparkSession from pyspark.sql.avro.functions import from_avro from pyspark.sql.functions import col, from_json from feast.data_format import AvroFormat, JsonFormat -from feast.data_source import KafkaSource +from feast.data_source import KafkaSource, PushMode +from feast.feature_store import FeatureStore from feast.infra.contrib.stream_processor import ( ProcessorConfig, StreamProcessor, @@ -24,16 +26,16 @@ class SparkProcessorConfig(ProcessorConfig): class 
SparkKafkaProcessor(StreamProcessor): spark: SparkSession format: str - write_function: MethodType + preprocess_fn: Optional[MethodType] join_keys: List[str] def __init__( self, + *, + fs: FeatureStore, sfv: StreamFeatureView, config: ProcessorConfig, - write_function: MethodType, - processing_time: str = "30 seconds", - query_timeout: int = 15, + preprocess_fn: Optional[MethodType] = None, ): if not isinstance(sfv.stream_source, KafkaSource): raise ValueError("data source is not kafka source") @@ -55,15 +57,16 @@ def __init__( if not isinstance(config, SparkProcessorConfig): raise ValueError("config is not spark processor config") self.spark = config.spark_session - self.write_function = write_function - self.processing_time = processing_time - self.query_timeout = query_timeout - super().__init__(sfv=sfv, data_source=sfv.stream_source) + self.preprocess_fn = preprocess_fn + self.processing_time = config.processing_time + self.query_timeout = config.query_timeout + self.join_keys = [fs.get_entity(entity).join_key for entity in sfv.entities] + super().__init__(fs=fs, sfv=sfv, data_source=sfv.stream_source) - def ingest_stream_feature_view(self) -> None: + def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None: ingested_stream_df = self._ingest_stream_data() transformed_df = self._construct_transformation_plan(ingested_stream_df) - online_store_query = self._write_to_online_store(transformed_df) + online_store_query = self._write_stream_data(transformed_df, to) return online_store_query def _ingest_stream_data(self) -> StreamTable: @@ -119,13 +122,35 @@ def _ingest_stream_data(self) -> StreamTable: def _construct_transformation_plan(self, df: StreamTable) -> StreamTable: return self.sfv.udf.__call__(df) if self.sfv.udf else df - def _write_to_online_store(self, df: StreamTable): + def _write_stream_data(self, df: StreamTable, to: PushMode): # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema. 
def batch_write(row: DataFrame, batch_id: int): - pd_row = row.toPandas() - self.write_function( - pd_row, input_timestamp="event_timestamp", output_timestamp="" + rows: pd.DataFrame = row.toPandas() + + # Extract the latest feature values for each unique entity row (i.e. the join keys). + # Also add a 'created' column. + rows = ( + rows.sort_values( + by=self.join_keys + [self.sfv.timestamp_field], ascending=True + ) + .groupby(self.join_keys) + .nth(0) ) + rows["created"] = pd.to_datetime("now", utc=True) + + # Reset indices to ensure the dataframe has all the required columns. + rows = rows.reset_index() + + # Optionally execute preprocessor before writing to the online store. + if self.preprocess_fn: + rows = self.preprocess_fn(rows) + + # Finally persist the data to the online store and/or offline store. + if rows.size > 0: + if to == PushMode.ONLINE or to == PushMode.ONLINE_AND_OFFLINE: + self.fs.write_to_online_store(self.sfv.name, rows) + if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE: + self.fs.write_to_offline_store(self.sfv.name, rows) query = ( df.writeStream.outputMode("update") diff --git a/sdk/python/feast/infra/contrib/stream_processor.py b/sdk/python/feast/infra/contrib/stream_processor.py index 2ccf1e59f8..24817c82ea 100644 --- a/sdk/python/feast/infra/contrib/stream_processor.py +++ b/sdk/python/feast/infra/contrib/stream_processor.py @@ -1,14 +1,17 @@ from abc import ABC -from typing import Callable +from types import MethodType +from typing import TYPE_CHECKING, Optional -import pandas as pd from pyspark.sql import DataFrame -from feast.data_source import DataSource +from feast.data_source import DataSource, PushMode from feast.importer import import_class from feast.repo_config import FeastConfigBaseModel from feast.stream_feature_view import StreamFeatureView +if TYPE_CHECKING: + from feast.feature_store import FeatureStore + STREAM_PROCESSOR_CLASS_FOR_TYPE = { ("spark", "kafka"): 
"feast.infra.contrib.spark_kafka_processor.SparkKafkaProcessor", } @@ -30,21 +33,26 @@ class StreamProcessor(ABC): and persist that data to the online store. Attributes: + fs: The feature store where data should be persisted. sfv: The stream feature view on which the stream processor operates. data_source: The stream data source from which data will be ingested. """ + fs: "FeatureStore" sfv: StreamFeatureView data_source: DataSource - def __init__(self, sfv: StreamFeatureView, data_source: DataSource): + def __init__( + self, fs: "FeatureStore", sfv: StreamFeatureView, data_source: DataSource + ): + self.fs = fs self.sfv = sfv self.data_source = data_source - def ingest_stream_feature_view(self) -> None: + def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None: """ Ingests data from the stream source attached to the stream feature view; transforms the data - and then persists it to the online store. + and then persists it to the online store and/or offline store, depending on the 'to' parameter. """ pass @@ -62,26 +70,32 @@ def _construct_transformation_plan(self, table: StreamTable) -> StreamTable: """ pass - def _write_to_online_store(self, table: StreamTable) -> None: + def _write_stream_data(self, table: StreamTable, to: PushMode) -> None: """ - Returns query for persisting data to the online store. + Launches a job to persist stream data to the online store and/or offline store, depending + on the 'to' parameter, and returns a handle for the job. """ pass def get_stream_processor_object( config: ProcessorConfig, + fs: "FeatureStore", sfv: StreamFeatureView, - write_function: Callable[[pd.DataFrame, str, str], None], + preprocess_fn: Optional[MethodType] = None, ): """ - Returns a stream processor object based on the config mode and stream source type. The write function is a - function that wraps the feature store "write_to_online_store" capability. + Returns a stream processor object based on the config. 
+ + The returned object will be capable of launching an ingestion job that reads data from the + given stream feature view's stream source, transforms it if the stream feature view has a + transformation, and then writes it to the online store. It will also preprocess the data + if a preprocessor method is defined. """ if config.mode == "spark" and config.source == "kafka": stream_processor = STREAM_PROCESSOR_CLASS_FOR_TYPE[("spark", "kafka")] module_name, class_name = stream_processor.rsplit(".", 1) cls = import_class(module_name, class_name, "StreamProcessor") - return cls(sfv=sfv, config=config, write_function=write_function,) + return cls(fs=fs, sfv=sfv, config=config, preprocess_fn=preprocess_fn) else: raise ValueError("other processors besides spark-kafka not supported") diff --git a/sdk/python/feast/infra/key_encoding_utils.py b/sdk/python/feast/infra/key_encoding_utils.py index 8333610473..62b6b72724 100644 --- a/sdk/python/feast/infra/key_encoding_utils.py +++ b/sdk/python/feast/infra/key_encoding_utils.py @@ -6,7 +6,9 @@ from feast.protos.feast.types.Value_pb2 import ValueType -def _serialize_val(value_type, v: ValueProto) -> Tuple[bytes, int]: +def _serialize_val( + value_type, v: ValueProto, entity_key_serialization_version=1 +) -> Tuple[bytes, int]: if value_type == "string_val": return v.string_val.encode("utf8"), ValueType.STRING elif value_type == "bytes_val": @@ -14,14 +16,16 @@ def _serialize_val(value_type, v: ValueProto) -> Tuple[bytes, int]: elif value_type == "int32_val": return struct.pack(" bytes: """ - Serialize keys to a bytestring so it can be used to prefix-scan through items stored in the online store + Serialize keys to a bytestring, so it can be used to prefix-scan through items stored in the online store using serialize_entity_key. 
This encoding is a partial implementation of serialize_entity_key, only operating on the keys of entities, @@ -35,7 +39,9 @@ def serialize_entity_key_prefix(entity_keys: List[str]) -> bytes: return b"".join(output) -def serialize_entity_key(entity_key: EntityKeyProto) -> bytes: +def serialize_entity_key( + entity_key: EntityKeyProto, entity_key_serialization_version=1 +) -> bytes: """ Serialize entity key to a bytestring so it can be used as a lookup key in a hash table. @@ -54,7 +60,11 @@ def serialize_entity_key(entity_key: EntityKeyProto) -> bytes: output.append(struct.pack(" MaterializationJobStatus: + ... + + @abstractmethod + def error(self) -> Optional[BaseException]: + ... + + @abstractmethod + def should_be_retried(self) -> bool: + ... + + @abstractmethod + def job_id(self) -> str: + ... + + @abstractmethod + def url(self) -> Optional[str]: + ... + + +class BatchMaterializationEngine(ABC): + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + self.repo_config = repo_config + self.offline_store = offline_store + self.online_store = online_store + + @abstractmethod + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + """This method ensures that any necessary infrastructure or resources needed by the + engine are set up ahead of materialization.""" + + @abstractmethod + def materialize( + self, registry: BaseRegistry, tasks: List[MaterializationTask] + ) -> List[MaterializationJob]: + """ + Materialize data from the offline store to the online store for this feature repo. + Args: + registry: The feast registry containing the applied feature views. + tasks: A list of individual materialization tasks. 
+ Returns: + A list of materialization jobs representing each task. + """ + ... + + @abstractmethod + def teardown_infra( + self, + project: str, + fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], + entities: Sequence[Entity], + ): + """This method ensures that any infrastructure or resources set up by ``update()``are torn down.""" diff --git a/sdk/python/feast/infra/materialization/lambda/Dockerfile b/sdk/python/feast/infra/materialization/lambda/Dockerfile new file mode 100644 index 0000000000..bbdb74bdfe --- /dev/null +++ b/sdk/python/feast/infra/materialization/lambda/Dockerfile @@ -0,0 +1,25 @@ +FROM public.ecr.aws/lambda/python:3.9 + +RUN yum install -y git + + +# Copy app handler code +COPY sdk/python/feast/infra/materialization/lambda/app.py ${LAMBDA_TASK_ROOT} + +# Copy necessary parts of the Feast codebase +COPY sdk/python sdk/python +COPY protos protos +COPY go go +COPY setup.py setup.py +COPY pyproject.toml pyproject.toml +COPY README.md README.md + +# Install Feast for AWS with Lambda dependencies +# We need this mount thingy because setuptools_scm needs access to the +# git dir to infer the version of feast we're installing. +# https://github.com/pypa/setuptools_scm#usage-from-docker +# I think it also assumes that this dockerfile is being built from the root of the directory. 
+RUN --mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.[aws,redis]' + +# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) +CMD [ "app.handler" ] diff --git a/sdk/python/feast/infra/materialization/lambda/__init__.py b/sdk/python/feast/infra/materialization/lambda/__init__.py new file mode 100644 index 0000000000..d21505d91e --- /dev/null +++ b/sdk/python/feast/infra/materialization/lambda/__init__.py @@ -0,0 +1,11 @@ +from .lambda_engine import ( + LambdaMaterializationEngine, + LambdaMaterializationEngineConfig, + LambdaMaterializationJob, +) + +__all__ = [ + "LambdaMaterializationEngineConfig", + "LambdaMaterializationJob", + "LambdaMaterializationEngine", +] diff --git a/sdk/python/feast/infra/materialization/lambda/app.py b/sdk/python/feast/infra/materialization/lambda/app.py new file mode 100644 index 0000000000..375674adaa --- /dev/null +++ b/sdk/python/feast/infra/materialization/lambda/app.py @@ -0,0 +1,85 @@ +import base64 +import json +import sys +import tempfile +import traceback +from pathlib import Path + +import pyarrow.parquet as pq + +from feast import FeatureStore +from feast.constants import FEATURE_STORE_YAML_ENV_NAME +from feast.infra.materialization.local_engine import DEFAULT_BATCH_SIZE +from feast.utils import _convert_arrow_to_proto, _run_pyarrow_field_mapping + + +def handler(event, context): + """Provide an event that contains the following keys: + + - operation: one of the operations in the operations dict below + - tableName: required for operations that interact with DynamoDB + - payload: a parameter to pass to the operation being performed + """ + print("Received event: " + json.dumps(event, indent=2), flush=True) + + try: + + config_base64 = event[FEATURE_STORE_YAML_ENV_NAME] + + config_bytes = base64.b64decode(config_base64) + + # Create a new unique directory for writing feature_store.yaml + repo_path = Path(tempfile.mkdtemp()) + + with open(repo_path / 
"feature_store.yaml", "wb") as f: + f.write(config_bytes) + + # Initialize the feature store + store = FeatureStore(repo_path=str(repo_path.resolve())) + + view_name = event["view_name"] + view_type = event["view_type"] + path = event["path"] + + bucket = path[len("s3://") :].split("/", 1)[0] + key = path[len("s3://") :].split("/", 1)[1] + print(f"Inferred Bucket: `{bucket}` Key: `{key}`", flush=True) + + if view_type == "batch": + # TODO: This probably needs to be become `store.get_batch_feature_view` at some point. + feature_view = store.get_feature_view(view_name) + else: + feature_view = store.get_stream_feature_view(view_name) + + print(f"Got Feature View: `{feature_view}`", flush=True) + + table = pq.read_table(path) + if feature_view.batch_source.field_mapping is not None: + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) + + join_key_to_value_type = { + entity.name: entity.dtype.to_value_type() + for entity in feature_view.entity_columns + } + + written_rows = 0 + + for batch in table.to_batches(DEFAULT_BATCH_SIZE): + rows_to_write = _convert_arrow_to_proto( + batch, feature_view, join_key_to_value_type + ) + store._provider.online_write_batch( + store.config, + feature_view, + rows_to_write, + lambda x: None, + ) + written_rows += len(rows_to_write) + return {"written_rows": written_rows} + except Exception as e: + print(f"Exception: {e}", flush=True) + print("Traceback:", flush=True) + print(traceback.format_exc(), flush=True) + sys.exit(1) diff --git a/sdk/python/feast/infra/materialization/lambda/lambda_engine.py b/sdk/python/feast/infra/materialization/lambda/lambda_engine.py new file mode 100644 index 0000000000..69986ca6e1 --- /dev/null +++ b/sdk/python/feast/infra/materialization/lambda/lambda_engine.py @@ -0,0 +1,242 @@ +import base64 +import json +import logging +from concurrent.futures import ThreadPoolExecutor, wait +from dataclasses import dataclass +from datetime import datetime +from typing import 
Callable, List, Literal, Optional, Sequence, Union + +import boto3 +from pydantic import StrictStr +from tqdm import tqdm + +from feast.batch_feature_view import BatchFeatureView +from feast.constants import FEATURE_STORE_YAML_ENV_NAME +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.infra.materialization.batch_materialization_engine import ( + BatchMaterializationEngine, + MaterializationJob, + MaterializationJobStatus, + MaterializationTask, +) +from feast.infra.offline_stores.offline_store import OfflineStore +from feast.infra.online_stores.online_store import OnlineStore +from feast.registry import BaseRegistry +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.stream_feature_view import StreamFeatureView +from feast.utils import _get_column_names +from feast.version import get_version + +DEFAULT_BATCH_SIZE = 10_000 + +logger = logging.getLogger(__name__) + + +class LambdaMaterializationEngineConfig(FeastConfigBaseModel): + """Batch Materialization Engine config for lambda based engine""" + + type: Literal["lambda"] = "lambda" + """ Type selector""" + + materialization_image: StrictStr + """ The URI of a container image in the Amazon ECR registry, which should be used for materialization. 
""" + + lambda_role: StrictStr + """ Role that should be used by the materialization lambda """ + + +@dataclass +class LambdaMaterializationJob(MaterializationJob): + def __init__(self, job_id: str, status: MaterializationJobStatus) -> None: + super().__init__() + self._job_id: str = job_id + self._status = status + self._error = None + + def status(self) -> MaterializationJobStatus: + return self._status + + def error(self) -> Optional[BaseException]: + return self._error + + def should_be_retried(self) -> bool: + return False + + def job_id(self) -> str: + return self._job_id + + def url(self) -> Optional[str]: + return None + + +class LambdaMaterializationEngine(BatchMaterializationEngine): + """ + WARNING: This engine should be considered "Alpha" functionality. + """ + + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + # This should be setting up the lambda function. + r = self.lambda_client.create_function( + FunctionName=self.lambda_name, + PackageType="Image", + Role=self.repo_config.batch_engine.lambda_role, + Code={"ImageUri": self.repo_config.batch_engine.materialization_image}, + Timeout=600, + Tags={ + "feast-owned": "True", + "project": project, + "feast-sdk-version": get_version(), + }, + ) + logger.info( + "Creating lambda function %s, %s", + self.lambda_name, + r["ResponseMetadata"]["RequestId"], + ) + + logger.info("Waiting for function %s to be active", self.lambda_name) + waiter = self.lambda_client.get_waiter("function_active") + waiter.wait(FunctionName=self.lambda_name) + + def teardown_infra( + self, + project: str, + fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], + entities: Sequence[Entity], + ): + # This should be tearing down the lambda function. 
+ logger.info("Tearing down lambda %s", self.lambda_name) + r = self.lambda_client.delete_function(FunctionName=self.lambda_name) + logger.info("Finished tearing down lambda %s: %s", self.lambda_name, r) + + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + super().__init__( + repo_config=repo_config, + offline_store=offline_store, + online_store=online_store, + **kwargs, + ) + repo_path = self.repo_config.repo_path + assert repo_path + feature_store_path = repo_path / "feature_store.yaml" + self.feature_store_base64 = str( + base64.b64encode(bytes(feature_store_path.read_text(), "UTF-8")), "UTF-8" + ) + + self.lambda_name = f"feast-materialize-{self.repo_config.project}" + if len(self.lambda_name) > 64: + self.lambda_name = self.lambda_name[:64] + self.lambda_client = boto3.client("lambda") + + def materialize( + self, registry, tasks: List[MaterializationTask] + ) -> List[MaterializationJob]: + return [ + self._materialize_one( + registry, + task.feature_view, + task.start_time, + task.end_time, + task.project, + task.tqdm_builder, + ) + for task in tasks + ] + + def _materialize_one( + self, + registry: BaseRegistry, + feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView], + start_date: datetime, + end_date: datetime, + project: str, + tqdm_builder: Callable[[int], tqdm], + ): + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + timestamp_field, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + job_id = f"{feature_view.name}-{start_date}-{end_date}" + + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=self.repo_config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + 
created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + paths = offline_job.to_remote_storage() + max_workers = len(paths) if len(paths) <= 20 else 20 + executor = ThreadPoolExecutor(max_workers=max_workers) + futures = [] + + for path in paths: + payload = { + FEATURE_STORE_YAML_ENV_NAME: self.feature_store_base64, + "view_name": feature_view.name, + "view_type": "batch", + "path": path, + } + # Invoke a lambda to materialize this file. + + logger.info("Invoking materialization for %s", path) + futures.append( + executor.submit( + self.lambda_client.invoke, + FunctionName=self.lambda_name, + InvocationType="RequestResponse", + Payload=json.dumps(payload), + ) + ) + + done, not_done = wait(futures) + logger.info("Done: %s Not Done: %s", done, not_done) + for f in done: + response = f.result() + output = json.loads(response["Payload"].read()) + + logger.info( + f"Ingested task; request id {response['ResponseMetadata']['RequestId']}, " + f"rows written: {output['written_rows']}" + ) + + for f in not_done: + response = f.result() + logger.error(f"Ingestion failed: {response}") + + return LambdaMaterializationJob( + job_id=job_id, + status=MaterializationJobStatus.SUCCEEDED + if not not_done + else MaterializationJobStatus.ERROR, + ) diff --git a/sdk/python/feast/infra/materialization/local_engine.py b/sdk/python/feast/infra/materialization/local_engine.py new file mode 100644 index 0000000000..4f775981ef --- /dev/null +++ b/sdk/python/feast/infra/materialization/local_engine.py @@ -0,0 +1,185 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import Callable, List, Literal, Optional, Sequence, Union + +from tqdm import tqdm + +from feast.batch_feature_view import BatchFeatureView +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.infra.offline_stores.offline_store import OfflineStore +from feast.infra.online_stores.online_store import OnlineStore +from 
feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.stream_feature_view import StreamFeatureView + +from ...registry import BaseRegistry +from ...utils import ( + _convert_arrow_to_proto, + _get_column_names, + _run_pyarrow_field_mapping, +) +from .batch_materialization_engine import ( + BatchMaterializationEngine, + MaterializationJob, + MaterializationJobStatus, + MaterializationTask, +) + +DEFAULT_BATCH_SIZE = 10_000 + + +class LocalMaterializationEngineConfig(FeastConfigBaseModel): + """Batch Materialization Engine config for local in-process engine""" + + type: Literal["local"] = "local" + """ Type selector""" + + +@dataclass +class LocalMaterializationJob(MaterializationJob): + def __init__( + self, + job_id: str, + status: MaterializationJobStatus, + error: Optional[BaseException] = None, + ) -> None: + super().__init__() + self._job_id: str = job_id + self._status: MaterializationJobStatus = status + self._error: Optional[BaseException] = error + + def status(self) -> MaterializationJobStatus: + return self._status + + def error(self) -> Optional[BaseException]: + return self._error + + def should_be_retried(self) -> bool: + return False + + def job_id(self) -> str: + return self._job_id + + def url(self) -> Optional[str]: + return None + + +class LocalMaterializationEngine(BatchMaterializationEngine): + def update( + self, + project: str, + views_to_delete: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + views_to_keep: Sequence[ + Union[BatchFeatureView, StreamFeatureView, FeatureView] + ], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + ): + # Nothing to set up. + pass + + def teardown_infra( + self, + project: str, + fvs: Sequence[Union[BatchFeatureView, StreamFeatureView, FeatureView]], + entities: Sequence[Entity], + ): + # Nothing to tear down. 
+ pass + + def __init__( + self, + *, + repo_config: RepoConfig, + offline_store: OfflineStore, + online_store: OnlineStore, + **kwargs, + ): + super().__init__( + repo_config=repo_config, + offline_store=offline_store, + online_store=online_store, + **kwargs, + ) + + def materialize( + self, registry, tasks: List[MaterializationTask] + ) -> List[MaterializationJob]: + return [ + self._materialize_one( + registry, + task.feature_view, + task.start_time, + task.end_time, + task.project, + task.tqdm_builder, + ) + for task in tasks + ] + + def _materialize_one( + self, + registry: BaseRegistry, + feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView], + start_date: datetime, + end_date: datetime, + project: str, + tqdm_builder: Callable[[int], tqdm], + ): + entities = [] + for entity_name in feature_view.entities: + entities.append(registry.get_entity(entity_name, project)) + + ( + join_key_columns, + feature_name_columns, + timestamp_field, + created_timestamp_column, + ) = _get_column_names(feature_view, entities) + + job_id = f"{feature_view.name}-{start_date}-{end_date}" + + try: + offline_job = self.offline_store.pull_latest_from_table_or_query( + config=self.repo_config, + data_source=feature_view.batch_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + ) + + table = offline_job.to_arrow() + + if feature_view.batch_source.field_mapping is not None: + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) + + join_key_to_value_type = { + entity.name: entity.dtype.to_value_type() + for entity in feature_view.entity_columns + } + + with tqdm_builder(table.num_rows) as pbar: + for batch in table.to_batches(DEFAULT_BATCH_SIZE): + rows_to_write = _convert_arrow_to_proto( + batch, feature_view, join_key_to_value_type + ) + 
self.online_store.online_write_batch( + self.repo_config, + feature_view, + rows_to_write, + lambda x: pbar.update(x), + ) + return LocalMaterializationJob( + job_id=job_id, status=MaterializationJobStatus.SUCCEEDED + ) + except BaseException as e: + return LocalMaterializationJob( + job_id=job_id, status=MaterializationJobStatus.ERROR, error=e + ) diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 259a3af7d9..e3791f08c7 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -57,6 +57,7 @@ from google.cloud import bigquery from google.cloud.bigquery import Client, SchemaField, Table from google.cloud.bigquery._pandas_helpers import ARROW_SCALAR_IDS_TO_BQ + from google.cloud.storage import Client as StorageClient except ImportError as e: from feast.errors import FeastExtrasDependencyImportError @@ -83,6 +84,9 @@ class BigQueryOfflineStoreConfig(FeastConfigBaseModel): For more information on BigQuery data locations see: https://cloud.google.com/bigquery/docs/locations """ + gcs_staging_location: Optional[str] = None + """ (optional) GCS location used for offloading BigQuery results as parquet files.""" + class BigQueryOfflineStore(OfflineStore): @staticmethod @@ -130,7 +134,10 @@ def pull_latest_from_table_or_query( # When materializing a single feature view, we don't need full feature names. 
On demand transforms aren't materialized return BigQueryRetrievalJob( - query=query, client=client, config=config, full_feature_names=False, + query=query, + client=client, + config=config, + full_feature_names=False, ) @staticmethod @@ -160,7 +167,10 @@ def pull_all_from_table_or_query( WHERE {timestamp_field} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}') """ return BigQueryRetrievalJob( - query=query, client=client, config=config, full_feature_names=False, + query=query, + client=client, + config=config, + full_feature_names=False, ) @staticmethod @@ -191,20 +201,27 @@ def get_historical_features( config.offline_store.location, ) - entity_schema = _get_entity_schema(client=client, entity_df=entity_df,) + entity_schema = _get_entity_schema( + client=client, + entity_df=entity_df, + ) - entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, entity_df_event_timestamp_col, client, + entity_df, + entity_df_event_timestamp_col, + client, ) @contextlib.contextmanager def query_generator() -> Iterator[str]: _upload_entity_df( - client=client, table_name=table_reference, entity_df=entity_df, + client=client, + table_name=table_reference, + entity_df=entity_df, ) expected_join_keys = offline_utils.get_expected_join_keys( @@ -289,7 +306,7 @@ def write_logged_features( file_obj=f, destination=destination.table, job_config=job_config, - ) + ).result() return @@ -302,7 +319,7 @@ def write_logged_features( file_obj=parquet_temp_file, destination=destination.table, job_config=job_config, - ) + ).result() @staticmethod def offline_write_batch( @@ -329,8 +346,8 @@ def offline_write_batch( ) if column_names != table.column_names: raise ValueError( - f"The input pyarrow table has schema {pa_schema} with the incorrect columns 
{column_names}. " - f"The columns are expected to be (in this order): {column_names}." + f"The input pyarrow table has schema {table.schema} with the incorrect columns {table.column_names}. " + f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." ) if table.schema != pa_schema: @@ -356,7 +373,7 @@ def offline_write_batch( file_obj=parquet_temp_file, destination=feature_view.batch_source.table, job_config=job_config, - ) + ).result() class BigQueryRetrievalJob(RetrievalJob): @@ -386,6 +403,14 @@ def query_generator() -> Iterator[str]: on_demand_feature_views if on_demand_feature_views else [] ) self._metadata = metadata + if self.config.offline_store.gcs_staging_location: + self._gcs_path = ( + self.config.offline_store.gcs_staging_location + + f"/{self.config.project}/export/" + + str(uuid.uuid4()) + ) + else: + self._gcs_path = None @property def full_feature_names(self) -> bool: @@ -412,7 +437,7 @@ def to_bigquery( job_config: bigquery.QueryJobConfig = None, timeout: int = 1800, retry_cadence: int = 10, - ) -> Optional[str]: + ) -> str: """ Triggers the execution of a historical feature retrieval query and exports the results to a BigQuery table. Runs for a maximum amount of time specified by the timeout parameter (defaulting to 30 minutes). 
@@ -478,6 +503,43 @@ def persist(self, storage: SavedDatasetStorage): def metadata(self) -> Optional[RetrievalMetadata]: return self._metadata + def supports_remote_storage_export(self) -> bool: + return self._gcs_path is not None + + def to_remote_storage(self) -> List[str]: + if not self._gcs_path: + raise ValueError( + "gcs_staging_location needs to be specified for the big query " + "offline store when executing `to_remote_storage()`" + ) + + table = self.to_bigquery() + + job_config = bigquery.job.ExtractJobConfig() + job_config.destination_format = "PARQUET" + + extract_job = self.client.extract_table( + table, + destination_uris=[f"{self._gcs_path}/*.parquet"], + location=self.config.offline_store.location, + job_config=job_config, + ) + extract_job.result() + + bucket: str + prefix: str + storage_client = StorageClient(project=self.client.project) + bucket, prefix = self._gcs_path[len("gs://") :].split("/", 1) + prefix = prefix.rsplit("/", 1)[0] + if prefix.startswith("/"): + prefix = prefix[1:] + + blobs = storage_client.list_blobs(bucket, prefix=prefix) + results = [] + for b in blobs: + results.append(f"gs://{b.bucket.name}/{b.name}") + return results + def block_until_done( client: Client, @@ -518,7 +580,7 @@ def _wait_until_done(bq_job): finally: if client.get_job(bq_job).state in ["PENDING", "RUNNING"]: - client.cancel_job(bq_job) + client.cancel_job(bq_job.job_id) raise BigQueryJobCancelled(job_id=bq_job.job_id) if bq_job.exception(): @@ -549,9 +611,12 @@ def _get_table_reference_for_new_entity( def _upload_entity_df( - client: Client, table_name: str, entity_df: Union[pd.DataFrame, str], + client: Client, + table_name: str, + entity_df: Union[pd.DataFrame, str], ) -> Table: """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table""" + job: Union[bigquery.job.query.QueryJob, bigquery.job.load.LoadJob] if isinstance(entity_df, str): job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})") diff --git 
a/sdk/python/feast/infra/offline_stores/bigquery_source.py b/sdk/python/feast/infra/offline_stores/bigquery_source.py index b06cc23369..bb8316869b 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery_source.py +++ b/sdk/python/feast/infra/offline_stores/bigquery_source.py @@ -204,7 +204,9 @@ class BigQueryOptions: """ def __init__( - self, table: Optional[str], query: Optional[str], + self, + table: Optional[str], + query: Optional[str], ): self.table = table or "" self.query = query or "" @@ -221,7 +223,8 @@ def from_proto(cls, bigquery_options_proto: DataSourceProto.BigQueryOptions): Returns a BigQueryOptions object based on the bigquery_options protobuf """ bigquery_options = cls( - table=bigquery_options_proto.table, query=bigquery_options_proto.query, + table=bigquery_options_proto.table, + query=bigquery_options_proto.query, ) return bigquery_options @@ -234,7 +237,8 @@ def to_proto(self) -> DataSourceProto.BigQueryOptions: BigQueryOptionsProto protobuf """ bigquery_options_proto = DataSourceProto.BigQueryOptions( - table=self.table, query=self.query, + table=self.table, + query=self.query, ) return bigquery_options_proto diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py index f2aa535c1d..28944df72e 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py @@ -127,8 +127,8 @@ def query_generator() -> Iterator[str]: else: raise TypeError(entity_df) - entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) ) expected_join_keys = offline_utils.get_expected_join_keys( @@ -140,7 +140,10 @@ def query_generator() -> Iterator[str]: ) entity_df_event_timestamp_range = 
_get_entity_df_event_timestamp_range( - entity_df, entity_df_event_timestamp_col, config, df_query, + entity_df, + entity_df_event_timestamp_col, + config, + df_query, ) query_context = offline_utils.get_feature_view_query_context( @@ -211,7 +214,7 @@ def pull_all_from_table_or_query( query = f""" SELECT {field_string} - FROM {from_expression} + FROM {from_expression} AS paftoq_alias WHERE "{timestamp_field}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz """ diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index 6671a47765..c84fce03dc 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -40,7 +40,10 @@ def postgres_container(): log_string_to_wait_for = "database system is ready to accept connections" waited = wait_for_logs( - container=container, predicate=log_string_to_wait_for, timeout=30, interval=10, + container=container, + predicate=log_string_to_wait_for, + timeout=30, + interval=10, ) logger.info("Waited for %s seconds until postgres container was up", waited) @@ -52,7 +55,9 @@ class PostgreSQLDataSourceCreator(DataSourceCreator, OnlineStoreCreator): def __init__( self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs ): - super().__init__(project_name,) + super().__init__( + project_name, + ) self.project_name = project_name self.container = fixture_request.getfixturevalue("postgres_container") diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py index 2a0925d929..2437714dec 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py +++ 
b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py @@ -130,13 +130,16 @@ def get_historical_features( tmp_entity_df_table_name = offline_utils.get_temp_entity_table_name() entity_schema = _get_entity_schema( - spark_session=spark_session, entity_df=entity_df, + spark_session=spark_session, + entity_df=entity_df, ) event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( entity_schema=entity_schema, ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, event_timestamp_col, spark_session, + entity_df, + event_timestamp_col, + spark_session, ) _upload_entity_df( spark_session=spark_session, @@ -327,8 +330,8 @@ def _get_entity_df_event_timestamp_range( df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) # TODO(kzhang132): need utc conversion here. entity_df_event_timestamp_range = ( - df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0], + df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], ) else: raise InvalidEntityType(type(entity_df)) diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py index ade1e54365..454e7ee87e 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py @@ -80,7 +80,10 @@ def __init__( RuntimeWarning, ) self.spark_options = SparkOptions( - table=table, query=query, path=path, file_format=file_format, + table=table, + query=query, + path=path, + file_format=file_format, ) @property @@ -174,7 +177,8 @@ def get_table_query_string(self) -> str: """Returns a string that can directly be used to reference this table in SQL""" if self.table: # Backticks make sure that spark sql knows this a table reference. 
- return f"`{self.table}`" + table = ".".join([f"`{x}`" for x in self.table.split(".")]) + return table if self.query: return f"({self.query})" @@ -304,7 +308,10 @@ def __init__( file_format: Optional[str] = None, ): self.spark_options = SparkOptions( - table=table, query=query, path=path, file_format=file_format, + table=table, + query=query, + path=path, + file_format=file_format, ) @staticmethod diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/test_config/manual_tests.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/test_config/manual_tests.py index 9c73f01819..7d31aa90fb 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/test_config/manual_tests.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/test_config/manual_tests.py @@ -7,6 +7,7 @@ FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( - provider="local", offline_store_creator=TrinoSourceCreator, + provider="local", + offline_store_creator=TrinoSourceCreator, ), ] diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/tests/data_source.py index f2b9f785a0..67efa6a27f 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/tests/data_source.py @@ -63,7 +63,10 @@ def __init__( ) self.exposed_port = self.container.get_exposed_port("8080") self.client = Trino( - user="user", catalog="memory", host="localhost", port=self.exposed_port, + user="user", + catalog="memory", + host="localhost", + port=self.exposed_port, ) def teardown(self): diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py index 87a99b820e..88a9021d1c 100644 --- 
a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino.py @@ -202,7 +202,10 @@ def pull_latest_from_table_or_query( # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized return TrinoRetrievalJob( - query=query, client=client, config=config, full_feature_names=False, + query=query, + client=client, + config=config, + full_feature_names=False, ) @staticmethod @@ -240,8 +243,10 @@ def get_historical_features( connector=config.offline_store.connector, ) - entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema=entity_schema + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df( + entity_schema=entity_schema + ) ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( @@ -327,11 +332,17 @@ def pull_all_from_table_or_query( WHERE {timestamp_field} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' """ return TrinoRetrievalJob( - query=query, client=client, config=config, full_feature_names=False, + query=query, + client=client, + config=config, + full_feature_names=False, ) -def _get_table_reference_for_new_entity(catalog: str, dataset_name: str,) -> str: +def _get_table_reference_for_new_entity( + catalog: str, + dataset_name: str, +) -> str: """Gets the table_id for the new entity to be uploaded.""" table_name = offline_utils.get_temp_entity_table_name() return f"{catalog}.{dataset_name}.{table_name}" diff --git a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py index b559d0e59e..d82650712e 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/trino_offline_store/trino_source.py @@ -61,7 +61,8 @@ 
def from_proto(cls, trino_options_proto: DataSourceProto.TrinoOptions): Returns a TrinoOptions object based on the trino_options protobuf """ trino_options = cls( - table=trino_options_proto.table, query=trino_options_proto.query, + table=trino_options_proto.table, + query=trino_options_proto.query, ) return trino_options @@ -74,7 +75,8 @@ def to_proto(self) -> DataSourceProto.TrinoOptions: """ trino_options_proto = DataSourceProto.TrinoOptions( - table=self.table, query=self.query, + table=self.table, + query=self.query, ) return trino_options_proto diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 75968146de..829bd36c3d 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -29,14 +29,14 @@ DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, get_pyarrow_schema_from_batch_source, ) -from feast.infra.provider import ( - _get_requested_feature_views_to_features_dict, - _run_dask_field_mapping, -) from feast.registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage from feast.usage import log_exceptions_and_usage +from feast.utils import ( + _get_requested_feature_views_to_features_dict, + _run_dask_field_mapping, +) class FileOfflineStoreConfig(FeastConfigBaseModel): @@ -88,7 +88,8 @@ def _to_arrow_internal(self): def persist(self, storage: SavedDatasetStorage): assert isinstance(storage, SavedDatasetFileStorage) filesystem, path = FileSource.create_filesystem_and_path( - storage.file_options.uri, storage.file_options.s3_endpoint_override, + storage.file_options.uri, + storage.file_options.s3_endpoint_override, ) if path.endswith(".parquet"): @@ -105,6 +106,9 @@ def persist(self, storage: SavedDatasetStorage): def metadata(self) -> Optional[RetrievalMetadata]: return self._metadata + def supports_remote_storage_export(self) -> bool: + return False + class 
FileOfflineStore(OfflineStore): @staticmethod @@ -311,7 +315,9 @@ def evaluate_offline_job(): # TODO(kevjumba): remove try catch when fix is merged upstream in Dask. try: if created_timestamp_column: - source_df = source_df.sort_values(by=created_timestamp_column,) + source_df = source_df.sort_values( + by=created_timestamp_column, + ) source_df = source_df.sort_values(by=timestamp_field) @@ -349,7 +355,8 @@ def evaluate_offline_job(): # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized return FileRetrievalJob( - evaluation_function=evaluate_offline_job, full_feature_names=False, + evaluation_function=evaluate_offline_job, + full_feature_names=False, ) @staticmethod @@ -391,7 +398,8 @@ def write_logged_features( data = pyarrow.parquet.read_table(data, use_threads=False, pre_buffer=False) filesystem, path = FileSource.create_filesystem_and_path( - destination.path, destination.s3_endpoint_override, + destination.path, + destination.s3_endpoint_override, ) pyarrow.dataset.write_dataset( @@ -430,8 +438,8 @@ def offline_write_batch( ) if column_names != table.column_names: raise ValueError( - f"The input pyarrow table has schema {pa_schema} with the incorrect columns {column_names}. " - f"The columns are expected to be (in this order): {column_names}." + f"The input pyarrow table has schema {table.schema} with the incorrect columns {table.column_names}. " + f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." 
) file_options = feature_view.batch_source.file_options @@ -450,7 +458,8 @@ def offline_write_batch( def _get_entity_df_event_timestamp_range( - entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, ) -> Tuple[datetime, datetime]: if not isinstance(entity_df, pd.DataFrame): raise ValueError( @@ -480,7 +489,10 @@ def _read_datasource(data_source) -> dd.DataFrame: else None ) - return dd.read_parquet(data_source.path, storage_options=storage_options,) + return dd.read_parquet( + data_source.path, + storage_options=storage_options, + ) def _field_mapping( @@ -530,7 +542,8 @@ def _field_mapping( # Make sure to not have duplicated columns if entity_df_event_timestamp_col == timestamp_field: df_to_join = _run_dask_field_mapping( - df_to_join, {timestamp_field: f"__{timestamp_field}"}, + df_to_join, + {timestamp_field: f"__{timestamp_field}"}, ) timestamp_field = f"__{timestamp_field}" @@ -568,7 +581,9 @@ def _merge( def _normalize_timestamp( - df_to_join: dd.DataFrame, timestamp_field: str, created_timestamp_column: str, + df_to_join: dd.DataFrame, + timestamp_field: str, + created_timestamp_column: str, ) -> dd.DataFrame: df_to_join_types = df_to_join.dtypes timestamp_field_type = df_to_join_types[timestamp_field] @@ -620,6 +635,14 @@ def _filter_ttl( ) ] + df_to_join = df_to_join.persist() + else: + df_to_join = df_to_join[ + # do not drop entity rows if one of the sources returns NaNs + df_to_join[timestamp_field].isna() + | (df_to_join[timestamp_field] <= df_to_join[entity_df_event_timestamp_col]) + ] + df_to_join = df_to_join.persist() return df_to_join @@ -642,14 +665,18 @@ def _drop_duplicates( df_to_join = df_to_join.persist() df_to_join = df_to_join.drop_duplicates( - all_join_keys + [entity_df_event_timestamp_col], keep="last", ignore_index=True, + all_join_keys + [entity_df_event_timestamp_col], + keep="last", + ignore_index=True, ) return df_to_join.persist() def 
_drop_columns( - df_to_join: dd.DataFrame, timestamp_field: str, created_timestamp_column: str, + df_to_join: dd.DataFrame, + timestamp_field: str, + created_timestamp_column: str, ) -> dd.DataFrame: entity_df_with_features = df_to_join.drop([timestamp_field], axis=1).persist() diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index 439911fe2a..c8a0cb8a5c 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -85,7 +85,8 @@ def to_df( for odfv in self.on_demand_feature_views: features_df = features_df.join( odfv.get_transformed_features_df( - features_df, self.full_feature_names, + features_df, + self.full_feature_names, ) ) @@ -129,7 +130,8 @@ def to_arrow( for odfv in self.on_demand_feature_views: features_df = features_df.join( odfv.get_transformed_features_df( - features_df, self.full_feature_names, + features_df, + self.full_feature_names, ) ) @@ -163,6 +165,26 @@ def metadata(self) -> Optional[RetrievalMetadata]: """ pass + def supports_remote_storage_export(self) -> bool: + """ + This method should return True if the RetrievalJob supports `to_remote_storage()`. + """ + return False + + def to_remote_storage(self) -> List[str]: + """ + This method should export the result of this RetrievalJob to + remote storage (such as S3, GCS, HDFS, etc). + Implementations of this method should export the results as + multiple parquet files, each file sized appropriately + depending on how much data is being returned by the retrieval + job. + + Returns: + A list of parquet file paths in remote storage. 
+ """ + raise NotImplementedError() + class OfflineStore(ABC): """ diff --git a/sdk/python/feast/infra/offline_stores/offline_utils.py b/sdk/python/feast/infra/offline_stores/offline_utils.py index abe8d4e4e5..8b963a864b 100644 --- a/sdk/python/feast/infra/offline_stores/offline_utils.py +++ b/sdk/python/feast/infra/offline_stores/offline_utils.py @@ -17,11 +17,10 @@ from feast.feature_view import FeatureView from feast.importer import import_class from feast.infra.offline_stores.offline_store import OfflineStore -from feast.infra.provider import _get_requested_feature_views_to_features_dict from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.type_map import feast_value_type_to_pa -from feast.utils import to_naive_utc +from feast.utils import _get_requested_feature_views_to_features_dict, to_naive_utc DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL = "event_timestamp" diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index 8667989268..df70f958f7 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -192,12 +192,15 @@ def get_historical_features( entity_df, redshift_client, config, s3_resource ) - entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, entity_df_event_timestamp_col, redshift_client, config, + entity_df, + entity_df_event_timestamp_col, + redshift_client, + config, ) @contextlib.contextmanager @@ -323,8 +326,8 @@ def offline_write_batch( ) if column_names != table.column_names: raise ValueError( - f"The input pyarrow table has schema {pa_schema} with the incorrect columns {column_names}. " - f"The columns are expected to be (in this order): {column_names}." 
+ f"The input pyarrow table has schema {table.schema} with the incorrect columns {table.column_names}. " + f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." ) if table.schema != pa_schema: @@ -490,6 +493,13 @@ def persist(self, storage: SavedDatasetStorage): def metadata(self) -> Optional[RetrievalMetadata]: return self._metadata + def supports_remote_storage_export(self) -> bool: + return True + + def to_remote_storage(self) -> List[str]: + path = self.to_s3() + return aws_utils.list_s3_files(self._config.offline_store.region, path) + def _upload_entity_df( entity_df: Union[pd.DataFrame, str], diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py index ec06d8dce1..0f4c6a7b52 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake.py +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -1,5 +1,6 @@ import contextlib import os +import uuid from datetime import datetime from pathlib import Path from typing import ( @@ -90,6 +91,12 @@ class SnowflakeOfflineStoreConfig(FeastConfigBaseModel): schema_: Optional[str] = Field(None, alias="schema") """ Snowflake schema name """ + storage_integration_name: Optional[str] = None + """ Storage integration name in snowflake """ + + blob_export_location: Optional[str] = None + """ Location (in S3, Google storage or Azure storage) where data is offloaded """ + class Config: allow_population_by_field_name = True @@ -217,12 +224,14 @@ def get_historical_features( entity_schema = _get_entity_schema(entity_df, snowflake_conn, config) - entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, entity_df_event_timestamp_col, snowflake_conn, + entity_df, + 
entity_df_event_timestamp_col, + snowflake_conn, ) @contextlib.contextmanager @@ -332,8 +341,8 @@ def offline_write_batch( ) if column_names != table.column_names: raise ValueError( - f"The input pyarrow table has schema {pa_schema} with the incorrect columns {column_names}. " - f"The columns are expected to be (in this order): {column_names}." + f"The input pyarrow table has schema {table.schema} with the incorrect columns {table.column_names}. " + f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." ) if table.schema != pa_schema: @@ -378,6 +387,11 @@ def query_generator() -> Iterator[str]: on_demand_feature_views if on_demand_feature_views else [] ) self._metadata = metadata + self.export_path: Optional[str] + if self.config.offline_store.blob_export_location: + self.export_path = f"{self.config.offline_store.blob_export_location}/{self.config.project}/{uuid.uuid4()}" + else: + self.export_path = None @property def full_feature_names(self) -> bool: @@ -413,7 +427,7 @@ def _to_arrow_internal(self) -> pa.Table: pd.DataFrame(columns=[md.name for md in empty_result.description]) ) - def to_snowflake(self, table_name: str) -> None: + def to_snowflake(self, table_name: str, temporary=False) -> None: """Save dataset as a new Snowflake table""" if self.on_demand_feature_views is not None: transformed_df = self.to_df() @@ -425,7 +439,7 @@ def to_snowflake(self, table_name: str) -> None: return None with self._query_generator() as query: - query = f'CREATE TABLE IF NOT EXISTS "{table_name}" AS ({query});\n' + query = f'CREATE {"TEMPORARY" if temporary else ""} TABLE IF NOT EXISTS "{table_name}" AS ({query});\n' execute_snowflake_statement(self.snowflake_conn, query) @@ -453,6 +467,41 @@ def persist(self, storage: SavedDatasetStorage): def metadata(self) -> Optional[RetrievalMetadata]: return self._metadata + def supports_remote_storage_export(self) -> bool: + return ( + self.config.offline_store.storage_integration_name 
+ and self.config.offline_store.blob_export_location + ) + + def to_remote_storage(self) -> List[str]: + if not self.export_path: + raise ValueError( + "to_remote_storage() requires `blob_export_location` to be specified in config" + ) + if not self.config.offline_store.storage_integration_name: + raise ValueError( + "to_remote_storage() requires `storage_integration_name` to be specified in config" + ) + + table = f"temporary_{uuid.uuid4().hex}" + self.to_snowflake(table) + + copy_into_query = f"""copy into '{self.config.offline_store.blob_export_location}/{table}' from "{self.config.offline_store.database}"."{self.config.offline_store.schema_}"."{table}"\n + storage_integration = {self.config.offline_store.storage_integration_name}\n + file_format = (TYPE = PARQUET)\n + DETAILED_OUTPUT = TRUE\n + HEADER = TRUE;\n + """ + + cursor = execute_snowflake_statement(self.snowflake_conn, copy_into_query) + all_rows = ( + cursor.fetchall() + ) # This may be need pagination at some point in the future. 
+ file_name_column_index = [ + idx for idx, rm in enumerate(cursor.description) if rm.name == "FILE_NAME" + ][0] + return [f"{self.export_path}/{row[file_name_column_index]}" for row in all_rows] + def _get_entity_schema( entity_df: Union[pd.DataFrame, str], diff --git a/sdk/python/feast/infra/offline_stores/snowflake_source.py b/sdk/python/feast/infra/offline_stores/snowflake_source.py index b072c6e871..258fba71b1 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake_source.py +++ b/sdk/python/feast/infra/offline_stores/snowflake_source.py @@ -360,4 +360,6 @@ def to_proto(self) -> LoggingConfigProto: ) def to_data_source(self) -> DataSource: - return SnowflakeSource(table=self.table_name,) + return SnowflakeSource( + table=self.table_name, + ) diff --git a/sdk/python/feast/infra/online_stores/contrib/hbase_online_store/hbase.py b/sdk/python/feast/infra/online_stores/contrib/hbase_online_store/hbase.py index d95e83f429..aff0c6c42c 100644 --- a/sdk/python/feast/infra/online_stores/contrib/hbase_online_store/hbase.py +++ b/sdk/python/feast/infra/online_stores/contrib/hbase_online_store/hbase.py @@ -108,7 +108,10 @@ def online_write_batch( b = hbase.batch(table_name) for entity_key, values, timestamp, created_ts in data: - row_key = serialize_entity_key(entity_key).hex() + row_key = serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ).hex() values_dict = {} for feature_name, val in values.items(): values_dict[ @@ -154,7 +157,11 @@ def online_read( result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] row_keys = [ - serialize_entity_key(entity_key).hex() for entity_key in entity_keys + serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ).hex() + for entity_key in entity_keys ] rows = hbase.rows(table_name, row_keys=row_keys) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py 
b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 81727067f5..18f3b189e7 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -49,7 +49,10 @@ def online_write_batch( with self._get_conn(config) as conn, conn.cursor() as cur: insert_values = [] for entity_key, values, timestamp, created_ts in data: - entity_key_bin = serialize_entity_key(entity_key) + entity_key_bin = serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) timestamp = _to_naive_utc(timestamp) if created_ts is not None: created_ts = _to_naive_utc(created_ts) @@ -104,7 +107,12 @@ def online_read( # to PostgreSQL keys = [] for entity_key in entity_keys: - keys.append(serialize_entity_key(entity_key)) + keys.append( + serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) + ) cur.execute( sql.SQL( @@ -112,7 +120,9 @@ def online_read( SELECT entity_key, feature_name, value, event_ts FROM {} WHERE entity_key = ANY(%s); """ - ).format(sql.Identifier(_table_id(project, table)),), + ).format( + sql.Identifier(_table_id(project, table)), + ), (keys,), ) @@ -228,7 +238,10 @@ def _drop_table_and_index(table_name): DROP TABLE IF EXISTS {}; DROP INDEX IF EXISTS {}; """ - ).format(sql.Identifier(table_name), sql.Identifier(f"{table_name}_ek"),) + ).format( + sql.Identifier(table_name), + sql.Identifier(f"{table_name}_ek"), + ) def _to_naive_utc(ts: datetime): diff --git a/sdk/python/feast/infra/online_stores/datastore.py b/sdk/python/feast/infra/online_stores/datastore.py index fc3659ea1a..eabf2ccefc 100644 --- a/sdk/python/feast/infra/online_stores/datastore.py +++ b/sdk/python/feast/infra/online_stores/datastore.py @@ -162,7 +162,7 @@ def online_write_batch( with ThreadPool(processes=write_concurrency) as pool: pool.map( lambda b: self._write_minibatch( - client, feast_project, table, b, progress 
+ client, feast_project, table, b, progress, config ), self._to_minibatches(data, batch_size=write_batch_size), ) @@ -191,13 +191,22 @@ def _write_minibatch( Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] ], progress: Optional[Callable[[int], Any]], + config: RepoConfig, ): entities = [] for entity_key, features, timestamp, created_ts in data: - document_id = compute_entity_id(entity_key) + document_id = compute_entity_id( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) key = client.key( - "Project", project, "Table", table.name, "Row", document_id, + "Project", + project, + "Table", + table.name, + "Row", + document_id, ) entity = datastore.Entity( @@ -241,7 +250,10 @@ def online_read( keys: List[Key] = [] result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] for entity_key in entity_keys: - document_id = compute_entity_id(entity_key) + document_id = compute_entity_id( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) key = client.key( "Project", feast_project, "Table", table.name, "Row", document_id ) @@ -318,7 +330,10 @@ def _initialize_client( project_id: Optional[str], namespace: Optional[str] ) -> datastore.Client: try: - client = datastore.Client(project=project_id, namespace=namespace,) + client = datastore.Client( + project=project_id, + namespace=namespace, + ) return client except DefaultCredentialsError as e: raise FeastProviderLoginError( @@ -394,7 +409,8 @@ def from_infra_object_proto(infra_object_proto: InfraObjectProto) -> Any: @staticmethod def from_proto(datastore_table_proto: DatastoreTableProto) -> Any: datastore_table = DatastoreTable( - project=datastore_table_proto.project, name=datastore_table_proto.name, + project=datastore_table_proto.project, + name=datastore_table_proto.name, ) # Distinguish between null and empty string, since project_id and namespace are StringValues. 
diff --git a/sdk/python/feast/infra/online_stores/dynamodb.py b/sdk/python/feast/infra/online_stores/dynamodb.py index 50709fa3d4..257a1fd80d 100644 --- a/sdk/python/feast/infra/online_stores/dynamodb.py +++ b/sdk/python/feast/infra/online_stores/dynamodb.py @@ -188,7 +188,7 @@ def online_write_batch( table_instance = dynamodb_resource.Table( _get_table_name(online_config, config, table) ) - self._write_batch_non_duplicates(table_instance, data, progress) + self._write_batch_non_duplicates(table_instance, data, progress, config) @log_exceptions_and_usage(online_store="dynamodb") def online_read( @@ -216,7 +216,13 @@ def online_read( ) result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] - entity_ids = [compute_entity_id(entity_key) for entity_key in entity_keys] + entity_ids = [ + compute_entity_id( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) + for entity_key in entity_keys + ] batch_size = online_config.batch_size entity_ids_iter = iter(entity_ids) while True: @@ -229,7 +235,8 @@ def online_read( break batch_entity_ids = { table_instance.name: { - "Keys": [{"entity_id": entity_id} for entity_id in batch] + "Keys": [{"entity_id": entity_id} for entity_id in batch], + "ConsistentRead": True, } } with tracing_span(name="remote_call"): @@ -299,11 +306,15 @@ def _write_batch_non_duplicates( Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] ], progress: Optional[Callable[[int], Any]], + config: RepoConfig, ): """Deduplicate write batch request items on ``entity_id`` primary key.""" with table_instance.batch_writer(overwrite_by_pkeys=["entity_id"]) as batch: for entity_key, features, timestamp, created_ts in data: - entity_id = compute_entity_id(entity_key) + entity_id = compute_entity_id( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) batch.put_item( Item={ "entity_id": entity_id, # PartitionKey @@ -337,7 +348,8 @@ def 
_get_table_name( def _delete_table_idempotent( - dynamodb_resource, table_name: str, + dynamodb_resource, + table_name: str, ): try: table = dynamodb_resource.Table(table_name) @@ -398,7 +410,8 @@ def from_infra_object_proto(infra_object_proto: InfraObjectProto) -> Any: @staticmethod def from_proto(dynamodb_table_proto: DynamoDBTableProto) -> Any: return DynamoDBTable( - name=dynamodb_table_proto.name, region=dynamodb_table_proto.region, + name=dynamodb_table_proto.name, + region=dynamodb_table_proto.region, ) def update(self): diff --git a/sdk/python/feast/infra/online_stores/helpers.py b/sdk/python/feast/infra/online_stores/helpers.py index b206c08b7c..0e2fdb3500 100644 --- a/sdk/python/feast/infra/online_stores/helpers.py +++ b/sdk/python/feast/infra/online_stores/helpers.py @@ -21,8 +21,16 @@ def get_online_store_from_config(online_store_config: Any) -> OnlineStore: return online_store_class() -def _redis_key(project: str, entity_key: EntityKeyProto) -> bytes: - key: List[bytes] = [serialize_entity_key(entity_key), project.encode("utf-8")] +def _redis_key( + project: str, entity_key: EntityKeyProto, entity_key_serialization_version=1 +) -> bytes: + key: List[bytes] = [ + serialize_entity_key( + entity_key, + entity_key_serialization_version=entity_key_serialization_version, + ), + project.encode("utf-8"), + ] return b"".join(key) @@ -40,10 +48,17 @@ def _mmh3(key: str): return bytes.fromhex(struct.pack(" str: +def compute_entity_id( + entity_key: EntityKeyProto, entity_key_serialization_version=1 +) -> str: """ Compute Entity id given Feast Entity Key for online stores. Remember that Entity here refers to `EntityKeyProto` which is used in some online stores to encode the keys. It has nothing to do with the Entity concept we have in Feast. 
""" - return mmh3.hash_bytes(serialize_entity_key(entity_key)).hex() + return mmh3.hash_bytes( + serialize_entity_key( + entity_key, + entity_key_serialization_version=entity_key_serialization_version, + ) + ).hex() diff --git a/sdk/python/feast/infra/online_stores/redis.py b/sdk/python/feast/infra/online_stores/redis.py index 60fa9265ca..da458a3693 100644 --- a/sdk/python/feast/infra/online_stores/redis.py +++ b/sdk/python/feast/infra/online_stores/redis.py @@ -199,7 +199,11 @@ def online_write_batch( # TODO: investigate if check and set is a better approach rather than pulling all entity ts and then setting # it may be significantly slower but avoids potential (rare) race conditions for entity_key, _, _, _ in data: - redis_key_bin = _redis_key(project, entity_key) + redis_key_bin = _redis_key( + project, + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) keys.append(redis_key_bin) pipe.hmget(redis_key_bin, ts_key) prev_event_timestamps = pipe.execute() @@ -268,7 +272,11 @@ def online_read( keys = [] for entity_key in entity_keys: - redis_key_bin = _redis_key(project, entity_key) + redis_key_bin = _redis_key( + project, + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) keys.append(redis_key_bin) with client.pipeline(transaction=False) as pipe: for redis_key_bin in keys: diff --git a/sdk/python/feast/infra/online_stores/snowflake.py b/sdk/python/feast/infra/online_stores/snowflake.py new file mode 100644 index 0000000000..73c68e4bc0 --- /dev/null +++ b/sdk/python/feast/infra/online_stores/snowflake.py @@ -0,0 +1,267 @@ +import itertools +import os +from binascii import hexlify +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple + +import pandas as pd +import pytz +from pydantic import Field +from pydantic.schema import Literal + +from feast.entity import Entity +from feast.feature_view import 
FeatureView +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.utils.snowflake_utils import get_snowflake_conn, write_pandas_binary +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.usage import log_exceptions_and_usage + + +class SnowflakeOnlineStoreConfig(FeastConfigBaseModel): + """Online store config for Snowflake""" + + type: Literal["snowflake.online"] = "snowflake.online" + """ Online store type selector""" + + config_path: Optional[str] = ( + Path(os.environ["HOME"]) / ".snowsql/config" + ).__str__() + """ Snowflake config path -- absolute path required (Can't use ~)""" + + account: Optional[str] = None + """ Snowflake deployment identifier -- drop .snowflakecomputing.com""" + + user: Optional[str] = None + """ Snowflake user name """ + + password: Optional[str] = None + """ Snowflake password """ + + role: Optional[str] = None + """ Snowflake role name""" + + warehouse: Optional[str] = None + """ Snowflake warehouse name """ + + database: Optional[str] = None + """ Snowflake database name """ + + schema_: Optional[str] = Field("PUBLIC", alias="schema") + """ Snowflake schema name """ + + class Config: + allow_population_by_field_name = True + + +class SnowflakeOnlineStore(OnlineStore): + @log_exceptions_and_usage(online_store="snowflake") + def online_write_batch( + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], + ) -> None: + assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + + dfs = [None] * len(data) + for i, (entity_key, values, timestamp, created_ts) in enumerate(data): + df = pd.DataFrame( + columns=[ + "entity_feature_key", + 
"entity_key", + "feature_name", + "value", + "event_ts", + "created_ts", + ], + index=range(0, len(values)), + ) + + timestamp = _to_naive_utc(timestamp) + if created_ts is not None: + created_ts = _to_naive_utc(created_ts) + + entity_key_serialization_version = ( + config.entity_key_serialization_version + if config.entity_key_serialization_version + else 2 + ) + for j, (feature_name, val) in enumerate(values.items()): + df.loc[j, "entity_feature_key"] = serialize_entity_key( + entity_key, + entity_key_serialization_version, + ) + bytes(feature_name, encoding="utf-8") + df.loc[j, "entity_key"] = serialize_entity_key( + entity_key, + entity_key_serialization_version, + ) + df.loc[j, "feature_name"] = feature_name + df.loc[j, "value"] = val.SerializeToString() + df.loc[j, "event_ts"] = timestamp + df.loc[j, "created_ts"] = created_ts + + dfs[i] = df + + if dfs: + agg_df = pd.concat(dfs) + + # This combines both the data upload plus the overwrite in the same transaction + with get_snowflake_conn(config.online_store, autocommit=False) as conn: + write_pandas_binary( + conn, agg_df, f"[online-transient] {config.project}_{table.name}" + ) # special function for writing binary to snowflake + + query = f""" + INSERT OVERWRITE INTO "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" + SELECT + "entity_feature_key", + "entity_key", + "feature_name", + "value", + "event_ts", + "created_ts" + FROM + (SELECT + *, + ROW_NUMBER() OVER(PARTITION BY "entity_key","feature_name" ORDER BY "event_ts" DESC, "created_ts" DESC) AS "_feast_row" + FROM + "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}") + WHERE + "_feast_row" = 1; + """ + + conn.cursor().execute(query) + + if progress: + progress(len(data)) + + return None + + @log_exceptions_and_usage(online_store="snowflake") + def online_read( + self, + config: RepoConfig, + table: FeatureView, + 
entity_keys: List[EntityKeyProto], + requested_features: List[str], + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + + result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] + + entity_key_serialization_version = ( + config.entity_key_serialization_version + if config.entity_key_serialization_version + else 2 + ) + + entity_fetch_str = ",".join( + [ + ( + "TO_BINARY(" + + hexlify( + serialize_entity_key(combo[0], entity_key_serialization_version) + + bytes(combo[1], encoding="utf-8") + ).__str__()[1:] + + ")" + ) + for combo in itertools.product(entity_keys, requested_features) + ] + ) + + with get_snowflake_conn(config.online_store) as conn: + + df = ( + conn.cursor() + .execute( + f""" + SELECT + "entity_key", "feature_name", "value", "event_ts" + FROM + "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" + WHERE + "entity_feature_key" IN ({entity_fetch_str}) + """, + ) + .fetch_pandas_all() + ) + + for entity_key in entity_keys: + entity_key_bin = serialize_entity_key( + entity_key, + entity_key_serialization_version, + ) + res = {} + res_ts = None + for index, row in df[df["entity_key"] == entity_key_bin].iterrows(): + val = ValueProto() + val.ParseFromString(row["value"]) + res[row["feature_name"]] = val + res_ts = row["event_ts"].to_pydatetime() + + if not res: + result.append((None, None)) + else: + result.append((res_ts, res)) + return result + + @log_exceptions_and_usage(online_store="snowflake") + def update( + self, + config: RepoConfig, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, + ): + assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + + with get_snowflake_conn(config.online_store) as conn: + + for table in tables_to_keep: 
+ + conn.cursor().execute( + f"""CREATE TRANSIENT TABLE IF NOT EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}" ( + "entity_feature_key" BINARY, + "entity_key" BINARY, + "feature_name" VARCHAR, + "value" BINARY, + "event_ts" TIMESTAMP, + "created_ts" TIMESTAMP + )""" + ) + + for table in tables_to_delete: + + conn.cursor().execute( + f'DROP TABLE IF EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}"' + ) + + def teardown( + self, + config: RepoConfig, + tables: Sequence[FeatureView], + entities: Sequence[Entity], + ): + assert isinstance(config.online_store, SnowflakeOnlineStoreConfig) + + with get_snowflake_conn(config.online_store) as conn: + + for table in tables: + query = f'DROP TABLE IF EXISTS "{config.online_store.database}"."{config.online_store.schema_}"."[online-transient] {config.project}_{table.name}"' + conn.cursor().execute(query) + + +def _to_naive_utc(ts: datetime): + if ts.tzinfo is None: + return ts + else: + return ts.astimezone(pytz.utc).replace(tzinfo=None) diff --git a/sdk/python/feast/infra/online_stores/sqlite.py b/sdk/python/feast/infra/online_stores/sqlite.py index 2f0e902942..a880cef050 100644 --- a/sdk/python/feast/infra/online_stores/sqlite.py +++ b/sdk/python/feast/infra/online_stores/sqlite.py @@ -95,7 +95,10 @@ def online_write_batch( with conn: for entity_key, values, timestamp, created_ts in data: - entity_key_bin = serialize_entity_key(entity_key) + entity_key_bin = serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) timestamp = to_naive_utc(timestamp) if created_ts is not None: created_ts = to_naive_utc(created_ts) @@ -153,7 +156,13 @@ def online_read( f"FROM {_table_id(config.project, table)} " f"WHERE entity_key IN ({','.join('?' 
* len(entity_keys))}) " f"ORDER BY entity_key", - [serialize_entity_key(entity_key) for entity_key in entity_keys], + [ + serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) + for entity_key in entity_keys + ], ) rows = cur.fetchall() @@ -161,7 +170,10 @@ def online_read( k: list(group) for k, group in itertools.groupby(rows, key=lambda r: r[0]) } for entity_key in entity_keys: - entity_key_bin = serialize_entity_key(entity_key) + entity_key_bin = serialize_entity_key( + entity_key, + entity_key_serialization_version=config.entity_key_serialization_version, + ) res = {} res_ts = None for _, feature_name, val_bin, ts in rows.get(entity_key_bin, []): @@ -283,7 +295,10 @@ def from_infra_object_proto(infra_object_proto: InfraObjectProto) -> Any: @staticmethod def from_proto(sqlite_table_proto: SqliteTableProto) -> Any: - return SqliteTable(path=sqlite_table_proto.path, name=sqlite_table_proto.name,) + return SqliteTable( + path=sqlite_table_proto.path, + name=sqlite_table_proto.name, + ) def update(self): self.conn.execute( diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 8c6dd831dd..0b09f5df43 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -2,30 +2,35 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import pandas as pd -import pyarrow import pyarrow as pa from tqdm import tqdm -from feast import FeatureService +from feast import importer +from feast.batch_feature_view import BatchFeatureView from feast.entity import Entity from feast.feature_logging import FeatureServiceLoggingSource +from feast.feature_service import FeatureService from feast.feature_view import FeatureView +from feast.infra.materialization import BatchMaterializationEngine, MaterializationTask +from feast.infra.materialization.batch_materialization_engine import ( + 
MaterializationJobStatus, +) from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config from feast.infra.online_stores.helpers import get_online_store_from_config -from feast.infra.provider import ( - Provider, - _convert_arrow_to_proto, - _get_column_names, - _run_field_mapping, -) +from feast.infra.provider import Provider from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.registry import BaseRegistry -from feast.repo_config import RepoConfig +from feast.repo_config import BATCH_ENGINE_CLASS_FOR_TYPE, RepoConfig from feast.saved_dataset import SavedDataset +from feast.stream_feature_view import StreamFeatureView from feast.usage import RatioSampler, log_exceptions_and_usage, set_usage_attribute -from feast.utils import make_tzaware +from feast.utils import ( + _convert_arrow_to_proto, + _run_pyarrow_field_mapping, + make_tzaware, +) DEFAULT_BATCH_SIZE = 10_000 @@ -41,6 +46,7 @@ def __init__(self, config: RepoConfig): self.repo_config = config self._offline_store = None self._online_store = None + self._batch_engine: Optional[BatchMaterializationEngine] = None @property def online_store(self): @@ -58,6 +64,46 @@ def offline_store(self): ) return self._offline_store + @property + def batch_engine(self) -> BatchMaterializationEngine: + if self._batch_engine: + return self._batch_engine + else: + engine_config = self.repo_config._batch_engine_config + config_is_dict = False + if isinstance(engine_config, str): + engine_config_type = engine_config + elif isinstance(engine_config, Dict): + if "type" not in engine_config: + raise ValueError("engine_config needs to have a `type` specified.") + engine_config_type = engine_config["type"] + config_is_dict = True + else: + raise RuntimeError( + f"Invalid config type specified for batch_engine: {type(engine_config)}" + ) + + if 
engine_config_type in BATCH_ENGINE_CLASS_FOR_TYPE: + engine_config_type = BATCH_ENGINE_CLASS_FOR_TYPE[engine_config_type] + engine_module, engine_class_name = engine_config_type.rsplit(".", 1) + engine_class = importer.import_class(engine_module, engine_class_name) + + if config_is_dict: + _batch_engine = engine_class( + repo_config=self.repo_config, + offline_store=self.offline_store, + online_store=self.online_store, + **engine_config, + ) + else: + _batch_engine = engine_class( + repo_config=self.repo_config, + offline_store=self.offline_store, + online_store=self.online_store, + ) + self._batch_engine = _batch_engine + return _batch_engine + def update_infra( self, project: str, @@ -79,13 +125,26 @@ def update_infra( entities_to_delete=entities_to_delete, partial=partial, ) + if self.batch_engine: + self.batch_engine.update( + project, + tables_to_delete, + tables_to_keep, + entities_to_delete, + entities_to_keep, + ) def teardown_infra( - self, project: str, tables: Sequence[FeatureView], entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ) -> None: set_usage_attribute("provider", self.__class__.__name__) if self.online_store: self.online_store.teardown(self.repo_config, tables, entities) + if self.batch_engine: + self.batch_engine.teardown_infra(project, tables, entities) def online_write_batch( self, @@ -131,13 +190,18 @@ def online_read( return result def ingest_df( - self, feature_view: FeatureView, entities: List[Entity], df: pd.DataFrame, + self, + feature_view: FeatureView, + entities: List[Entity], + df: pd.DataFrame, ): set_usage_attribute("provider", self.__class__.__name__) table = pa.Table.from_pandas(df) if feature_view.batch_source.field_mapping is not None: - table = _run_field_mapping(table, feature_view.batch_source.field_mapping) + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) join_keys = {entity.join_key: entity.value_type for entity in 
entities} rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys) @@ -150,7 +214,9 @@ def ingest_df_to_offline_store(self, feature_view: FeatureView, table: pa.Table) set_usage_attribute("provider", self.__class__.__name__) if feature_view.batch_source.field_mapping is not None: - table = _run_field_mapping(table, feature_view.batch_source.field_mapping) + table = _run_pyarrow_field_mapping( + table, feature_view.batch_source.field_mapping + ) self.offline_write_batch(self.repo_config, feature_view, table, None) @@ -165,50 +231,24 @@ def materialize_single_feature_view( tqdm_builder: Callable[[int], tqdm], ) -> None: set_usage_attribute("provider", self.__class__.__name__) - - entities = [] - for entity_name in feature_view.entities: - entities.append(registry.get_entity(entity_name, project)) - - ( - join_key_columns, - feature_name_columns, - timestamp_field, - created_timestamp_column, - ) = _get_column_names(feature_view, entities) - - offline_job = self.offline_store.pull_latest_from_table_or_query( - config=config, - data_source=feature_view.batch_source, - join_key_columns=join_key_columns, - feature_name_columns=feature_name_columns, - timestamp_field=timestamp_field, - created_timestamp_column=created_timestamp_column, - start_date=start_date, - end_date=end_date, + assert ( + isinstance(feature_view, BatchFeatureView) + or isinstance(feature_view, StreamFeatureView) + or isinstance(feature_view, FeatureView) + ), f"Unexpected type for {feature_view.name}: {type(feature_view)}" + task = MaterializationTask( + project=project, + feature_view=feature_view, + start_time=start_date, + end_time=end_date, + tqdm_builder=tqdm_builder, ) - - table = offline_job.to_arrow() - - if feature_view.batch_source.field_mapping is not None: - table = _run_field_mapping(table, feature_view.batch_source.field_mapping) - - join_key_to_value_type = { - entity.name: entity.dtype.to_value_type() - for entity in feature_view.entity_columns - } - - with 
tqdm_builder(table.num_rows) as pbar: - for batch in table.to_batches(DEFAULT_BATCH_SIZE): - rows_to_write = _convert_arrow_to_proto( - batch, feature_view, join_key_to_value_type - ) - self.online_write_batch( - self.repo_config, - feature_view, - rows_to_write, - lambda x: pbar.update(x), - ) + jobs = self.batch_engine.materialize(registry, [task]) + assert len(jobs) == 1 + if jobs[0].status() == MaterializationJobStatus.ERROR and jobs[0].error(): + e = jobs[0].error() + assert e + raise e def get_historical_features( self, @@ -260,7 +300,7 @@ def retrieve_saved_dataset( def write_feature_service_logs( self, feature_service: FeatureService, - logs: Union[pyarrow.Table, str], + logs: Union[pa.Table, str], config: RepoConfig, registry: BaseRegistry, ): diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index d2e37e69db..086c9ec6b3 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -1,29 +1,24 @@ import abc -from collections import defaultdict from datetime import datetime from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union -import dask.dataframe as dd import pandas as pd import pyarrow from tqdm import tqdm from feast import FeatureService, errors from feast.entity import Entity -from feast.feature_view import DUMMY_ENTITY_ID, FeatureView +from feast.feature_view import FeatureView from feast.importer import import_class from feast.infra.infra_object import Infra from feast.infra.offline_stores.offline_store import RetrievalJob -from feast.on_demand_feature_view import OnDemandFeatureView from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDataset -from 
feast.type_map import python_values_to_proto_values -from feast.value_type import ValueType PROVIDERS_CLASS_FOR_TYPE = { "gcp": "feast.infra.gcp.GcpProvider", @@ -79,7 +74,10 @@ def plan_infra( @abc.abstractmethod def teardown_infra( - self, project: str, tables: Sequence[FeatureView], entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): """ Tear down all cloud resources for a repo. @@ -119,7 +117,10 @@ def online_write_batch( ... def ingest_df( - self, feature_view: FeatureView, entities: List[Entity], df: pd.DataFrame, + self, + feature_view: FeatureView, + entities: List[Entity], + df: pd.DataFrame, ): """ Ingests a DataFrame directly into the online store @@ -127,7 +128,9 @@ def ingest_df( pass def ingest_df_to_offline_store( - self, feature_view: FeatureView, df: pyarrow.Table, + self, + feature_view: FeatureView, + df: pyarrow.Table, ): """ Ingests a DataFrame directly into the offline store @@ -252,198 +255,3 @@ def get_provider(config: RepoConfig, repo_path: Path) -> Provider: cls = import_class(module_name, class_name, "Provider") return cls(config) - - -def _get_requested_feature_views_to_features_dict( - feature_refs: List[str], - feature_views: List[FeatureView], - on_demand_feature_views: List[OnDemandFeatureView], -) -> Tuple[Dict[FeatureView, List[str]], Dict[OnDemandFeatureView, List[str]]]: - """Create a dict of FeatureView -> List[Feature] for all requested features. 
- Set full_feature_names to True to have feature names prefixed by their feature view name.""" - - feature_views_to_feature_map: Dict[FeatureView, List[str]] = defaultdict(list) - on_demand_feature_views_to_feature_map: Dict[ - OnDemandFeatureView, List[str] - ] = defaultdict(list) - - for ref in feature_refs: - ref_parts = ref.split(":") - feature_view_from_ref = ref_parts[0] - feature_from_ref = ref_parts[1] - - found = False - for fv in feature_views: - if fv.projection.name_to_use() == feature_view_from_ref: - found = True - feature_views_to_feature_map[fv].append(feature_from_ref) - for odfv in on_demand_feature_views: - if odfv.projection.name_to_use() == feature_view_from_ref: - found = True - on_demand_feature_views_to_feature_map[odfv].append(feature_from_ref) - - if not found: - raise ValueError(f"Could not find feature view from reference {ref}") - - return feature_views_to_feature_map, on_demand_feature_views_to_feature_map - - -def _get_column_names( - feature_view: FeatureView, entities: List[Entity] -) -> Tuple[List[str], List[str], str, Optional[str]]: - """ - If a field mapping exists, run it in reverse on the join keys, - feature names, event timestamp column, and created timestamp column - to get the names of the relevant columns in the offline feature store table. - - Returns: - Tuple containing the list of reverse-mapped join_keys, - reverse-mapped feature names, reverse-mapped event timestamp column, - and reverse-mapped created timestamp column that will be passed into - the query to the offline store. 
- """ - # if we have mapped fields, use the original field names in the call to the offline store - timestamp_field = feature_view.batch_source.timestamp_field - feature_names = [feature.name for feature in feature_view.features] - created_timestamp_column = feature_view.batch_source.created_timestamp_column - join_keys = [ - entity.join_key for entity in entities if entity.join_key != DUMMY_ENTITY_ID - ] - if feature_view.batch_source.field_mapping is not None: - reverse_field_mapping = { - v: k for k, v in feature_view.batch_source.field_mapping.items() - } - timestamp_field = ( - reverse_field_mapping[timestamp_field] - if timestamp_field in reverse_field_mapping.keys() - else timestamp_field - ) - created_timestamp_column = ( - reverse_field_mapping[created_timestamp_column] - if created_timestamp_column - and created_timestamp_column in reverse_field_mapping.keys() - else created_timestamp_column - ) - join_keys = [ - reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col - for col in join_keys - ] - feature_names = [ - reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col - for col in feature_names - ] - - # We need to exclude join keys and timestamp columns from the list of features, after they are mapped to - # their final column names via the `field_mapping` field of the source. 
- feature_names = [ - name - for name in feature_names - if name not in join_keys - and name != timestamp_field - and name != created_timestamp_column - ] - return ( - join_keys, - feature_names, - timestamp_field, - created_timestamp_column, - ) - - -def _run_field_mapping( - table: pyarrow.Table, field_mapping: Dict[str, str], -) -> pyarrow.Table: - # run field mapping in the forward direction - cols = table.column_names - mapped_cols = [ - field_mapping[col] if col in field_mapping.keys() else col for col in cols - ] - table = table.rename_columns(mapped_cols) - return table - - -def _run_dask_field_mapping( - table: dd.DataFrame, field_mapping: Dict[str, str], -): - if field_mapping: - # run field mapping in the forward direction - table = table.rename(columns=field_mapping) - table = table.persist() - - return table - - -def _coerce_datetime(ts): - """ - Depending on underlying time resolution, arrow to_pydict() sometimes returns pd - timestamp type (for nanosecond resolution), and sometimes you get standard python datetime - (for microsecond resolution). - While pd timestamp class is a subclass of python datetime, it doesn't always behave the - same way. We convert it to normal datetime so that consumers downstream don't have to deal - with these quirks. - """ - if isinstance(ts, pd.Timestamp): - return ts.to_pydatetime() - else: - return ts - - -def _convert_arrow_to_proto( - table: Union[pyarrow.Table, pyarrow.RecordBatch], - feature_view: FeatureView, - join_keys: Dict[str, ValueType], -) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]: - # Avoid ChunkedArrays which guarentees `zero_copy_only` availiable. 
- if isinstance(table, pyarrow.Table): - table = table.to_batches()[0] - - columns = [ - (field.name, field.dtype.to_value_type()) for field in feature_view.features - ] + list(join_keys.items()) - - proto_values_by_column = { - column: python_values_to_proto_values( - table.column(column).to_numpy(zero_copy_only=False), value_type - ) - for column, value_type in columns - } - - entity_keys = [ - EntityKeyProto( - join_keys=join_keys, - entity_values=[proto_values_by_column[k][idx] for k in join_keys], - ) - for idx in range(table.num_rows) - ] - - # Serialize the features per row - feature_dict = { - feature.name: proto_values_by_column[feature.name] - for feature in feature_view.features - } - features = [dict(zip(feature_dict, vars)) for vars in zip(*feature_dict.values())] - - # Convert event_timestamps - event_timestamps = [ - _coerce_datetime(val) - for val in pd.to_datetime( - table.column(feature_view.batch_source.timestamp_field).to_numpy( - zero_copy_only=False - ) - ) - ] - - # Convert created_timestamps if they exist - if feature_view.batch_source.created_timestamp_column: - created_timestamps = [ - _coerce_datetime(val) - for val in pd.to_datetime( - table.column( - feature_view.batch_source.created_timestamp_column - ).to_numpy(zero_copy_only=False) - ) - ] - else: - created_timestamps = [None] * table.num_rows - - return list(zip(entity_keys, features, event_timestamps, created_timestamps)) diff --git a/sdk/python/feast/infra/registry_stores/sql.py b/sdk/python/feast/infra/registry_stores/sql.py index 7ea4a96849..9c6b47a714 100644 --- a/sdk/python/feast/infra/registry_stores/sql.py +++ b/sdk/python/feast/infra/registry_stores/sql.py @@ -1,4 +1,6 @@ +import uuid from datetime import datetime +from enum import Enum from pathlib import Path from typing import Any, List, Optional, Set, Union @@ -17,6 +19,7 @@ ) from sqlalchemy.engine import Engine +from feast import usage from feast.base_feature_view import BaseFeatureView from feast.data_source import 
DataSource from feast.entity import Entity @@ -32,6 +35,7 @@ from feast.feature_view import FeatureView from feast.infra.infra_object import Infra from feast.on_demand_feature_view import OnDemandFeatureView +from feast.project_metadata import ProjectMetadata from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.Entity_pb2 import Entity as EntityProto from feast.protos.feast.core.FeatureService_pb2 import ( @@ -156,6 +160,12 @@ Column("infra_proto", LargeBinary, nullable=False), ) + +class FeastMetadataKeys(Enum): + LAST_UPDATED_TIMESTAMP = "last_updated_timestamp" + PROJECT_UUID = "project_uuid" + + feast_metadata = Table( "feast_metadata", metadata, @@ -189,7 +199,7 @@ def teardown(self): stmt = delete(t) conn.execute(stmt) - def refresh(self): + def refresh(self, project: Optional[str]): # This method is a no-op since we're always reading the latest values from the db. pass @@ -459,8 +469,29 @@ def list_on_demand_feature_views( "feature_view_proto", ) + def list_project_metadata( + self, project: str, allow_cache: bool = False + ) -> List[ProjectMetadata]: + with self.engine.connect() as conn: + stmt = select(feast_metadata).where( + feast_metadata.c.project_id == project, + ) + rows = conn.execute(stmt).all() + if rows: + project_metadata = ProjectMetadata(project_name=project) + for row in rows: + if row["metadata_key"] == FeastMetadataKeys.PROJECT_UUID.value: + project_metadata.project_uuid = row["metadata_value"] + break + # TODO(adchia): Add other project metadata in a structured way + return [project_metadata] + return [] + def apply_saved_dataset( - self, saved_dataset: SavedDataset, project: str, commit: bool = True, + self, + saved_dataset: SavedDataset, + project: str, + commit: bool = True, ): return self._apply_object( saved_datasets, @@ -568,7 +599,9 @@ def apply_user_metadata( getattr(table.c, "feature_view_name") == name, table.c.project_id == project, ) - .values(values,) + .values( + 
values, + ) ) conn.execute(update_stmt) else: @@ -629,6 +662,7 @@ def proto(self) -> RegistryProto: (self.list_feature_services, r.feature_services), (self.list_saved_datasets, r.saved_datasets), (self.list_validation_references, r.validation_references), + (self.list_project_metadata, r.project_metadata), ]: objs: List[Any] = lister(project) # type: ignore if objs: @@ -651,14 +685,16 @@ def commit(self): def _apply_object( self, table, project: str, id_field_name, obj, proto_field_name, name=None ): + self._maybe_init_project_metadata(project) + name = name or obj.name with self.engine.connect() as conn: + update_datetime = datetime.utcnow() + update_time = int(update_datetime.timestamp()) stmt = select(table).where( getattr(table.c, id_field_name) == name, table.c.project_id == project ) row = conn.execute(stmt).first() - update_datetime = datetime.utcnow() - update_time = int(update_datetime.timestamp()) if hasattr(obj, "last_updated_timestamp"): obj.last_updated_timestamp = update_datetime @@ -670,7 +706,9 @@ def _apply_object( update_stmt = ( update(table) .where(getattr(table.c, id_field_name) == name) - .values(values,) + .values( + values, + ) ) conn.execute(update_stmt) else: @@ -680,11 +718,37 @@ def _apply_object( "last_updated_timestamp": update_time, "project_id": project, } - insert_stmt = insert(table).values(values,) + insert_stmt = insert(table).values( + values, + ) conn.execute(insert_stmt) self._set_last_updated_metadata(update_datetime, project) + def _maybe_init_project_metadata(self, project): + # Initialize project metadata if needed + with self.engine.connect() as conn: + update_datetime = datetime.utcnow() + update_time = int(update_datetime.timestamp()) + stmt = select(feast_metadata).where( + feast_metadata.c.metadata_key == FeastMetadataKeys.PROJECT_UUID.value, + feast_metadata.c.project_id == project, + ) + row = conn.execute(stmt).first() + if row: + usage.set_current_project_uuid(row["metadata_value"]) + else: + new_project_uuid = 
f"{uuid.uuid4()}" + values = { + "metadata_key": FeastMetadataKeys.PROJECT_UUID.value, + "metadata_value": new_project_uuid, + "last_updated_timestamp": update_time, + "project_id": project, + } + insert_stmt = insert(feast_metadata).values(values) + conn.execute(insert_stmt) + usage.set_current_project_uuid(new_project_uuid) + def _delete_object(self, table, name, project, id_field_name, not_found_exception): with self.engine.connect() as conn: stmt = delete(table).where( @@ -708,6 +772,8 @@ def _get_object( proto_field_name, not_found_exception, ): + self._maybe_init_project_metadata(project) + with self.engine.connect() as conn: stmt = select(table).where( getattr(table.c, id_field_name) == name, table.c.project_id == project @@ -721,6 +787,7 @@ def _get_object( def _list_objects( self, table, project, proto_class, python_class, proto_field_name ): + self._maybe_init_project_metadata(project) with self.engine.connect() as conn: stmt = select(table).where(table.c.project_id == project) rows = conn.execute(stmt).all() @@ -736,7 +803,8 @@ def _list_objects( def _set_last_updated_metadata(self, last_updated: datetime, project: str): with self.engine.connect() as conn: stmt = select(feast_metadata).where( - feast_metadata.c.metadata_key == "last_updated_timestamp", + feast_metadata.c.metadata_key + == FeastMetadataKeys.LAST_UPDATED_TIMESTAMP.value, feast_metadata.c.project_id == project, ) row = conn.execute(stmt).first() @@ -744,7 +812,7 @@ def _set_last_updated_metadata(self, last_updated: datetime, project: str): update_time = int(last_updated.timestamp()) values = { - "metadata_key": "last_updated_timestamp", + "metadata_key": FeastMetadataKeys.LAST_UPDATED_TIMESTAMP.value, "metadata_value": f"{update_time}", "last_updated_timestamp": update_time, "project_id": project, @@ -753,20 +821,24 @@ def _set_last_updated_metadata(self, last_updated: datetime, project: str): update_stmt = ( update(feast_metadata) .where( - feast_metadata.c.metadata_key == 
"last_updated_timestamp", + feast_metadata.c.metadata_key + == FeastMetadataKeys.LAST_UPDATED_TIMESTAMP.value, feast_metadata.c.project_id == project, ) .values(values) ) conn.execute(update_stmt) else: - insert_stmt = insert(feast_metadata).values(values,) + insert_stmt = insert(feast_metadata).values( + values, + ) conn.execute(insert_stmt) def _get_last_updated_metadata(self, project: str): with self.engine.connect() as conn: stmt = select(feast_metadata).where( - feast_metadata.c.metadata_key == "last_updated_timestamp", + feast_metadata.c.metadata_key + == FeastMetadataKeys.LAST_UPDATED_TIMESTAMP.value, feast_metadata.c.project_id == project, ) row = conn.execute(stmt).first() diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 7badda9846..3c8ad9d71b 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -3,7 +3,7 @@ import tempfile import uuid from pathlib import Path -from typing import Any, Dict, Iterator, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import pandas as pd import pyarrow @@ -89,7 +89,10 @@ def execute_redshift_statement_async( """ try: return redshift_data_client.execute_statement( - ClusterIdentifier=cluster_id, Database=database, DbUser=user, Sql=query, + ClusterIdentifier=cluster_id, + Database=database, + DbUser=user, + Sql=query, ) except ClientError as e: if e.response["Error"]["Code"] == "ValidationException": @@ -157,7 +160,11 @@ def get_redshift_statement_result(redshift_data_client, statement_id: str) -> di return redshift_data_client.get_statement_result(Id=statement_id) -def upload_df_to_s3(s3_resource, s3_path: str, df: pd.DataFrame,) -> None: +def upload_df_to_s3( + s3_resource, + s3_path: str, + df: pd.DataFrame, +) -> None: """Uploads a Pandas DataFrame to S3 as a parquet file Args: @@ -236,11 +243,19 @@ def upload_df_to_redshift( def delete_redshift_table( - 
redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, + redshift_data_client, + cluster_id: str, + database: str, + user: str, + table_name: str, ): drop_query = f"DROP {table_name} IF EXISTS" execute_redshift_statement( - redshift_data_client, cluster_id, database, user, drop_query, + redshift_data_client, + cluster_id, + database, + user, + drop_query, ) @@ -376,7 +391,11 @@ def temporarily_upload_df_to_redshift( # Clean up the uploaded Redshift table execute_redshift_statement( - redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", + redshift_data_client, + cluster_id, + database, + user, + f"DROP TABLE {table_name}", ) @@ -423,7 +442,11 @@ def temporarily_upload_arrow_table_to_redshift( # Clean up the uploaded Redshift table execute_redshift_statement( - redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", + redshift_data_client, + cluster_id, + database, + user, + f"DROP TABLE {table_name}", ) @@ -473,7 +496,7 @@ def execute_redshift_query_and_unload_to_s3( # Run the query, unload the results to S3 unique_table_name = "_" + str(uuid.uuid4()).replace("-", "") query = f"CREATE TEMPORARY TABLE {unique_table_name} AS ({query});\n" - query += f"UNLOAD ('SELECT * FROM {unique_table_name}') TO '{s3_path}/' IAM_ROLE '{iam_role}' PARQUET" + query += f"UNLOAD ('SELECT * FROM {unique_table_name}') TO '{s3_path}/' IAM_ROLE '{iam_role}' FORMAT AS PARQUET" execute_redshift_statement(redshift_data_client, cluster_id, database, user, query) @@ -491,7 +514,13 @@ def unload_redshift_query_to_pa( bucket, key = get_bucket_and_key(s3_path) execute_redshift_query_and_unload_to_s3( - redshift_data_client, cluster_id, database, user, s3_path, iam_role, query, + redshift_data_client, + cluster_id, + database, + user, + s3_path, + iam_role, + query, ) with tempfile.TemporaryDirectory() as temp_dir: @@ -632,3 +661,14 @@ def delete_api_gateway(api_gateway_client, api_gateway_id: str) -> Dict: def 
get_account_id() -> str: """Get AWS Account ID""" return boto3.client("sts").get_caller_identity().get("Account") + + +def list_s3_files(aws_region: str, path: str) -> List[str]: + s3 = boto3.client("s3", config=Config(region_name=aws_region)) + if path.startswith("s3://"): + path = path[len("s3://") :] + bucket, prefix = path.split("/", 1) + objects = s3.list_objects_v2(Bucket=bucket, Prefix=prefix) + contents = objects["Contents"] + files = [f"s3://{bucket}/{content['Key']}" for content in contents] + return files diff --git a/sdk/python/feast/infra/utils/hbase_utils.py b/sdk/python/feast/infra/utils/hbase_utils.py index 78a39caed8..4816a60087 100644 --- a/sdk/python/feast/infra/utils/hbase_utils.py +++ b/sdk/python/feast/infra/utils/hbase_utils.py @@ -167,13 +167,16 @@ def main(): table = connection.table("test_hbase_driver_hourly_stats") row_keys = [ serialize_entity_key( - EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1004)]) + EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1004)]), + entity_key_serialization_version=2, ).hex(), serialize_entity_key( - EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1005)]) + EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1005)]), + entity_key_serialization_version=2, ).hex(), serialize_entity_key( - EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1024)]) + EntityKey(join_keys=["driver_id"], entity_values=[Value(int64_val=1024)]), + entity_key_serialization_version=2, ).hex(), ] rows = table.rows(row_keys) diff --git a/sdk/python/feast/infra/utils/postgres/connection_utils.py b/sdk/python/feast/infra/utils/postgres/connection_utils.py index 6dbb4a4bc0..0e9cbf96fe 100644 --- a/sdk/python/feast/infra/utils/postgres/connection_utils.py +++ b/sdk/python/feast/infra/utils/postgres/connection_utils.py @@ -64,5 +64,8 @@ def get_query_schema(config: PostgreSQLConfig, sql_query: str) -> Dict[str, np.d """ with _get_conn(config) as conn: 
conn.set_session(readonly=True) - df = pd.read_sql(f"SELECT * FROM {sql_query} LIMIT 0", conn,) + df = pd.read_sql( + f"SELECT * FROM {sql_query} LIMIT 0", + conn, + ) return dict(zip(df.columns, df.dtypes)) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 05834ae436..f54288e45d 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -44,8 +44,12 @@ def execute_snowflake_statement(conn: SnowflakeConnection, query) -> SnowflakeCu def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: - assert config.type == "snowflake.offline" - config_header = "connections.feast_offline_store" + assert config.type in ["snowflake.offline", "snowflake.online"] + + if config.type == "snowflake.offline": + config_header = "connections.feast_offline_store" + elif config.type == "snowflake.online": + config_header = "connections.feast_online_store" config_dict = dict(config) @@ -122,8 +126,8 @@ def write_pandas( conn: Connection to be used to communicate with Snowflake. df: Dataframe we'd like to write back. table_name: Table name where we want to insert into. - database: Database schema and table is in, if not provided the default one will be used (Default value = None). - schema: Schema table is in, if not provided the default one will be used (Default value = None). + database: Database table is in, if not provided the connection one will be used. + schema: Schema table is in, if not provided the connection one will be used. chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once (Default value = None). compression: The compression used on the Parquet files, can only be gzip, or snappy. 
Gzip gives supposedly a @@ -342,7 +346,10 @@ def upload_df( def upload_local_pq( - path: Path, cursor: SnowflakeCursor, stage_name: str, parallel: int = 4, + path: Path, + cursor: SnowflakeCursor, + stage_name: str, + parallel: int = 4, ): """ Args: @@ -429,3 +436,176 @@ def parse_private_key_path(key_path: str, private_key_passphrase: str) -> bytes: ) return pkb + + +def write_pandas_binary( + conn: SnowflakeConnection, + df: pd.DataFrame, + table_name: str, + database: Optional[str] = None, + schema: Optional[str] = None, + chunk_size: Optional[int] = None, + compression: str = "gzip", + on_error: str = "abort_statement", + parallel: int = 4, + quote_identifiers: bool = True, + auto_create_table: bool = False, + create_temp_table: bool = False, +): + """Allows users to most efficiently write back a pandas DataFrame to Snowflake. + + It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table. + + Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested + with all of the COPY INTO command's output for debugging purposes. + + Example usage: + import pandas + from snowflake.connector.pandas_tools import write_pandas + + df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance']) + success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers') + + Args: + conn: Connection to be used to communicate with Snowflake. + df: Dataframe we'd like to write back. + table_name: Table name where we want to insert into. + database: Database table is in, if not provided the connection one will be used. + schema: Schema table is in, if not provided the connection one will be used. + chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once + (Default value = None). + compression: The compression used on the Parquet files, can only be gzip, or snappy. 
Gzip gives supposedly a + better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip'). + on_error: Action to take when COPY INTO statements fail, default follows documentation at: + https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions + (Default value = 'abort_statement'). + parallel: Number of threads to be used when uploading chunks, default follows documentation at: + https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4). + quote_identifiers: By default, identifiers, specifically database, schema, table and column names + (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting. + I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True) + auto_create_table: When true, will automatically create a table with corresponding columns for each column in + the passed in DataFrame. The table will not be created if it already exists + create_temp_table: Will make the auto-created table as a temporary table + """ + if database is not None and schema is None: + raise ProgrammingError( + "Schema has to be provided to write_pandas when a database is provided" + ) + # This dictionary maps the compression algorithm to Snowflake put copy into command type + # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet + compression_map = {"gzip": "auto", "snappy": "snappy"} + if compression not in compression_map.keys(): + raise ProgrammingError( + "Invalid compression '{}', only acceptable values are: {}".format( + compression, compression_map.keys() + ) + ) + if quote_identifiers: + location = ( + (('"' + database + '".') if database else "") + + (('"' + schema + '".') if schema else "") + + ('"' + table_name + '"') + ) + else: + location = ( + (database + "." if database else "") + + (schema + "." 
if schema else "") + + (table_name) + ) + if chunk_size is None: + chunk_size = len(df) + cursor: SnowflakeCursor = conn.cursor() + stage_name = create_temporary_sfc_stage(cursor) + + with TemporaryDirectory() as tmp_folder: + for i, chunk in chunk_helper(df, chunk_size): + chunk_path = os.path.join(tmp_folder, "file{}.txt".format(i)) + # Dump chunk into parquet file + chunk.to_parquet( + chunk_path, + compression=compression, + use_deprecated_int96_timestamps=True, + ) + # Upload parquet file + upload_sql = ( + "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + "'file://{path}' @\"{stage_name}\" PARALLEL={parallel}" + ).format( + path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"), + stage_name=stage_name, + parallel=parallel, + ) + logger.debug(f"uploading files with '{upload_sql}'") + cursor.execute(upload_sql, _is_internal=True) + # Remove chunk file + os.remove(chunk_path) + if quote_identifiers: + columns = '"' + '","'.join(list(df.columns)) + '"' + else: + columns = ",".join(list(df.columns)) + + if auto_create_table: + file_format_name = create_file_format(compression, compression_map, cursor) + infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))" + logger.debug(f"inferring schema with '{infer_schema_sql}'") + result_cursor = cursor.execute(infer_schema_sql, _is_internal=True) + if result_cursor is None: + raise SnowflakeQueryUnknownError(infer_schema_sql) + result = cast(List[Tuple[str, str]], result_cursor.fetchall()) + column_type_mapping: Dict[str, str] = dict(result) + # Infer schema can return the columns out of order depending on the chunking we do when uploading + # so we have to iterate through the dataframe columns to make sure we create the table with its + # columns in order + quote = '"' if quote_identifiers else "" + create_table_columns = ", ".join( + [f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns] + ) + 
create_table_sql = ( + f"CREATE {'TEMP ' if create_temp_table else ''}TABLE IF NOT EXISTS {location} " + f"({create_table_columns})" + f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + ) + logger.debug(f"auto creating table with '{create_table_sql}'") + cursor.execute(create_table_sql, _is_internal=True) + drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}" + logger.debug(f"dropping file format with '{drop_file_format_sql}'") + cursor.execute(drop_file_format_sql, _is_internal=True) + + # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly + # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html) + if quote_identifiers: + parquet_columns = ",".join( + f'TO_BINARY($1:"{c}")' + if c in ["entity_feature_key", "entity_key", "value"] + else f'$1:"{c}"' + for c in df.columns + ) + else: + parquet_columns = ",".join( + f"TO_BINARY($1:{c})" + if c in ["entity_feature_key", "entity_key", "value"] + else f"$1:{c}" + for c in df.columns + ) + + copy_into_sql = ( + "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + "({columns}) " + 'FROM (SELECT {parquet_columns} FROM @"{stage_name}") ' + "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression} BINARY_AS_TEXT = FALSE) " + "PURGE=TRUE ON_ERROR={on_error}" + ).format( + location=location, + columns=columns, + parquet_columns=parquet_columns, + stage_name=stage_name, + compression=compression_map[compression], + on_error=on_error, + ) + logger.debug("copying into with '{}'".format(copy_into_sql)) + # Snowflake returns the original cursor if the query execution succeeded. 
+ result_cursor = cursor.execute(copy_into_sql, _is_internal=True) + if result_cursor is None: + raise SnowflakeQueryUnknownError(copy_into_sql) + result_cursor.close() diff --git a/sdk/python/feast/on_demand_feature_view.py b/sdk/python/feast/on_demand_feature_view.py index bad4edba81..bb45dd6eb6 100644 --- a/sdk/python/feast/on_demand_feature_view.py +++ b/sdk/python/feast/on_demand_feature_view.py @@ -295,7 +295,10 @@ def to_proto(self) -> OnDemandFeatureViewProto: sources[source_name] = OnDemandSource( feature_view_projection=fv_projection.to_proto() ) - for (source_name, request_sources,) in self.source_request_sources.items(): + for ( + source_name, + request_sources, + ) in self.source_request_sources.items(): sources[source_name] = OnDemandSource( request_data_source=request_sources.to_proto() ) @@ -305,7 +308,8 @@ def to_proto(self) -> OnDemandFeatureViewProto: features=[feature.to_proto() for feature in self.features], sources=sources, user_defined_function=UserDefinedFunctionProto( - name=self.udf.__name__, body=dill.dumps(self.udf, recurse=True), + name=self.udf.__name__, + body=dill.dumps(self.udf, recurse=True), ), description=self.description, tags=self.tags, @@ -326,7 +330,10 @@ def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto): A OnDemandFeatureView object based on the on-demand feature view protobuf. 
""" sources = [] - for (_, on_demand_source,) in on_demand_feature_view_proto.spec.sources.items(): + for ( + _, + on_demand_source, + ) in on_demand_feature_view_proto.spec.sources.items(): if on_demand_source.WhichOneof("source") == "feature_view": sources.append( FeatureView.from_proto(on_demand_source.feature_view).projection @@ -341,6 +348,7 @@ def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto): sources.append( RequestSource.from_proto(on_demand_source.request_data_source) ) + on_demand_feature_view_obj = cls( name=on_demand_feature_view_proto.spec.name, schema=[ @@ -393,7 +401,9 @@ def get_request_data_schema(self) -> Dict[str, ValueType]: return schema def get_transformed_features_df( - self, df_with_features: pd.DataFrame, full_feature_names: bool = False, + self, + df_with_features: pd.DataFrame, + full_feature_names: bool = False, ) -> pd.DataFrame: # Apply on demand transformations columns_to_cleanup = [] diff --git a/sdk/python/feast/project_metadata.py b/sdk/python/feast/project_metadata.py new file mode 100644 index 0000000000..829e9ff0d5 --- /dev/null +++ b/sdk/python/feast/project_metadata.py @@ -0,0 +1,111 @@ +# Copyright 2019 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import uuid +from typing import Optional + +from google.protobuf.json_format import MessageToJson +from typeguard import typechecked + +from feast.protos.feast.core.Registry_pb2 import ProjectMetadata as ProjectMetadataProto +from feast.usage import log_exceptions + + +@typechecked +class ProjectMetadata: + """ + Tracks project level metadata + + Attributes: + project_name: The registry-scoped unique name of the project. + project_uuid: The UUID for this project + """ + + project_name: str + project_uuid: str + + @log_exceptions + def __init__( + self, + *args, + project_name: Optional[str] = None, + project_uuid: Optional[str] = None, + ): + """ + Creates an Project metadata object. + + Args: + project_name: The registry-scoped unique name of the project. + project_uuid: The UUID for this project + + Raises: + ValueError: Parameters are specified incorrectly. + """ + if not project_name: + raise ValueError("Project name needs to be specified") + + self.project_name = project_name + self.project_uuid = project_uuid or f"{uuid.uuid4()}" + + def __hash__(self) -> int: + return hash((self.project_name, self.project_uuid)) + + def __eq__(self, other): + if not isinstance(other, ProjectMetadata): + raise TypeError( + "Comparisons should only involve ProjectMetadata class objects." + ) + + if ( + self.project_name != other.project_name + or self.project_uuid != other.project_uuid + ): + return False + + return True + + def __str__(self): + return str(MessageToJson(self.to_proto())) + + def __lt__(self, other): + return self.project_name < other.project_name + + @classmethod + def from_proto(cls, project_metadata_proto: ProjectMetadataProto): + """ + Creates project metadata from a protobuf representation. + + Args: + project_metadata_proto: A protobuf representation of project metadata. + + Returns: + A ProjectMetadata object based on the protobuf. 
+ """ + entity = cls( + project_name=project_metadata_proto.project, + project_uuid=project_metadata_proto.project_uuid, + ) + + return entity + + def to_proto(self) -> ProjectMetadataProto: + """ + Converts a project metadata object to its protobuf representation. + + Returns: + An ProjectMetadataProto protobuf. + """ + + return ProjectMetadataProto( + project=self.project_name, project_uuid=self.project_uuid + ) diff --git a/sdk/python/feast/proto_json.py b/sdk/python/feast/proto_json.py index 44e004cb03..58b77edf8b 100644 --- a/sdk/python/feast/proto_json.py +++ b/sdk/python/feast/proto_json.py @@ -70,7 +70,7 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return value def from_json_object( - parser: _Parser, value: JsonObject, message: ProtoMessage, + parser: _Parser, value: JsonObject, message: ProtoMessage, path: str ) -> None: if value is None: message.null_val = 0 @@ -142,11 +142,11 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return [printer._MessageToJsonObject(item) for item in message.val] def from_json_object( - parser: _Parser, value: JsonObject, message: ProtoMessage, + parser: _Parser, value: JsonObject, message: ProtoMessage, path: str ) -> None: array = value if isinstance(value, list) else value["val"] for item in array: - parser.ConvertMessage(item, message.val.add()) + parser.ConvertMessage(item, message.val.add(), path) _patch_proto_json_encoding(RepeatedValue, to_json_object, from_json_object) @@ -183,7 +183,7 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject: return list(message.val) def from_json_object( - parser: _Parser, value: JsonObject, message: ProtoMessage, + parser: _Parser, value: JsonObject, message: ProtoMessage, path: str ) -> None: array = value if isinstance(value, list) else value["val"] message.val.extend(array) diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index c721bd648a..336bb2429f 100644 --- 
a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -14,6 +14,7 @@ import abc import json import logging +import uuid from abc import abstractmethod from collections import defaultdict from datetime import datetime, timedelta @@ -28,6 +29,7 @@ from google.protobuf.json_format import MessageToJson from proto import Message +from feast import usage from feast.base_feature_view import BaseFeatureView from feast.data_source import DataSource from feast.entity import Entity @@ -47,6 +49,8 @@ from feast.importer import import_class from feast.infra.infra_object import Infra from feast.on_demand_feature_view import OnDemandFeatureView +from feast.project_metadata import ProjectMetadata +from feast.protos.feast.core.Registry_pb2 import ProjectMetadata as ProjectMetadataProto from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto from feast.registry_store import NoopRegistryStore from feast.repo_config import RegistryConfig @@ -57,7 +61,6 @@ REGISTRY_SCHEMA_VERSION = "1" - REGISTRY_STORE_CLASS_FOR_TYPE = { "GCSRegistryStore": "feast.infra.gcp.GCSRegistryStore", "S3RegistryStore": "feast.infra.aws.S3RegistryStore", @@ -121,7 +124,6 @@ def get_objects_from_repo_contents( FEAST_OBJECT_TYPES = [feast_object_type for feast_object_type in FeastObjectType] - logger = logging.getLogger(__name__) @@ -489,7 +491,10 @@ def apply_materialization( # Saved dataset operations @abstractmethod def apply_saved_dataset( - self, saved_dataset: SavedDataset, project: str, commit: bool = True, + self, + saved_dataset: SavedDataset, + project: str, + commit: bool = True, ): """ Stores a saved dataset metadata with Feast @@ -579,17 +584,17 @@ def get_validation_reference( self, name: str, project: str, allow_cache: bool = False ) -> ValidationReference: """ - Retrieves a validation reference. + Retrieves a validation reference. 
- Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry + Args: + name: Name of dataset + project: Feast project that this dataset belongs to + allow_cache: Whether to allow returning this dataset from a cached registry - Returns: - Returns either the specified ValidationReference, or raises an exception if - none is found - """ + Returns: + Returns either the specified ValidationReference, or raises an exception if + none is found + """ # TODO: Needs to be implemented. def list_validation_references( @@ -607,6 +612,20 @@ def list_validation_references( List of request feature views """ + def list_project_metadata( + self, project: str, allow_cache: bool = False + ) -> List[ProjectMetadata]: + """ + Retrieves project metadata + + Args: + project: Filter metadata based on project name + allow_cache: Allow returning feature views from the cached registry + + Returns: + List of project metadata + """ + @abstractmethod def update_infra(self, infra: Infra, project: str, commit: bool = True): """ @@ -660,7 +679,7 @@ def commit(self): """Commits the state of the registry cache to the remote registry store.""" @abstractmethod - def refresh(self): + def refresh(self, project: Optional[str]): """Refreshes the state of the registry cache by fetching the registry state from the remote registry store.""" @staticmethod @@ -678,6 +697,10 @@ def to_dict(self, project: str) -> Dict[str, List[Any]]: """ registry_dict: Dict[str, Any] = defaultdict(list) registry_dict["project"] = project + for project_metadata in sorted(self.list_project_metadata(project=project)): + registry_dict["projectMetadata"].append( + self._message_to_sorted_dict(project_metadata.to_proto()) + ) for data_source in sorted( self.list_data_sources(project=project), key=lambda ds: ds.name ): @@ -733,6 +756,25 @@ def to_dict(self, project: str) -> Dict[str, List[Any]]: return registry_dict +def 
_get_project_metadata( + registry_proto: Optional[RegistryProto], project: str +) -> Optional[ProjectMetadataProto]: + if not registry_proto: + return None + for pm in registry_proto.project_metadata: + if pm.project == project: + return pm + return None + + +def _init_project_metadata(cached_registry_proto: RegistryProto, project: str): + new_project_uuid = f"{uuid.uuid4()}" + usage.set_current_project_uuid(new_project_uuid) + cached_registry_proto.project_metadata.append( + ProjectMetadata(project_name=project, project_uuid=new_project_uuid).to_proto() + ) + + class Registry(BaseRegistry): """ Registry: A registry allows for the management and persistence of feature definitions and related metadata. @@ -811,13 +853,14 @@ def clone(self) -> "Registry": new_registry._registry_store = NoopRegistryStore() return new_registry - def _initialize_registry(self): + def _initialize_registry(self, project: str): """Explicitly initializes the registry with an empty proto if it doesn't exist.""" try: - self._get_registry_proto() + self._get_registry_proto(project=project) except FileNotFoundError: registry_proto = RegistryProto() registry_proto.registry_schema_version = REGISTRY_SCHEMA_VERSION + _init_project_metadata(registry_proto, project) self._registry_store.update_registry_proto(registry_proto) def update_infra(self, infra: Infra, project: str, commit: bool = True): @@ -829,7 +872,7 @@ def update_infra(self, infra: Infra, project: str, commit: bool = True): project: Feast project that the Infra object refers to commit: Whether the change should be persisted immediately """ - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto self.cached_registry_proto.infra.CopyFrom(infra.to_proto()) @@ -847,7 +890,9 @@ def get_infra(self, project: str, allow_cache: bool = False) -> Infra: Returns: The stored Infra object. 
""" - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) return Infra.from_proto(registry_proto.infra) def apply_entity(self, entity: Entity, project: str, commit: bool = True): @@ -868,7 +913,7 @@ def apply_entity(self, entity: Entity, project: str, commit: bool = True): entity_proto = entity.to_proto() entity_proto.spec.project = project - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for idx, existing_entity_proto in enumerate( @@ -896,7 +941,9 @@ def list_entities(self, project: str, allow_cache: bool = False) -> List[Entity] Returns: List of entities """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) entities = [] for entity_proto in registry_proto.entities: if entity_proto.spec.project == project: @@ -916,7 +963,9 @@ def list_data_sources( Returns: List of data sources """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) data_sources = [] for data_source_proto in registry_proto.data_sources: if data_source_proto.project == project: @@ -934,7 +983,7 @@ def apply_data_source( project: Feast project that this data source belongs to commit: Whether to immediately commit to the registry """ - registry = self._prepare_registry_for_changes() + registry = self._prepare_registry_for_changes(project) for idx, existing_data_source_proto in enumerate(registry.data_sources): if existing_data_source_proto.name == data_source.name: del registry.data_sources[idx] @@ -959,7 +1008,7 @@ def delete_data_source(self, name: str, project: str, commit: bool = True): project: Feast project that this data source belongs to commit: Whether the change should be persisted immediately """ - 
self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for idx, data_source_proto in enumerate( @@ -990,7 +1039,7 @@ def apply_feature_service( feature_service_proto = feature_service.to_proto() feature_service_proto.spec.project = project - registry = self._prepare_registry_for_changes() + registry = self._prepare_registry_for_changes(project) for idx, existing_feature_service_proto in enumerate(registry.feature_services): if ( @@ -1017,7 +1066,7 @@ def list_feature_services( List of feature services """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) feature_services = [] for feature_service_proto in registry.feature_services: if feature_service_proto.spec.project == project: @@ -1041,7 +1090,7 @@ def get_feature_service( Returns either the specified feature service, or raises an exception if none is found """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for feature_service_proto in registry.feature_services: if ( @@ -1064,7 +1113,9 @@ def get_entity(self, name: str, project: str, allow_cache: bool = False) -> Enti Returns either the specified entity, or raises an exception if none is found """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) for entity_proto in registry_proto.entities: if entity_proto.spec.name == name and entity_proto.spec.project == project: return Entity.from_proto(entity_proto) @@ -1090,7 +1141,7 @@ def apply_feature_view( feature_view_proto = feature_view.to_proto() feature_view_proto.spec.project = project - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto self._check_conflicting_feature_view_names(feature_view) 
@@ -1129,6 +1180,7 @@ def apply_feature_view( else: del existing_feature_views_of_same_type[idx] break + existing_feature_views_of_same_type.append(feature_view_proto) if commit: self.commit() @@ -1146,7 +1198,7 @@ def list_stream_feature_views( Returns: List of stream feature views """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) stream_feature_views = [] for stream_feature_view in registry.stream_feature_views: if stream_feature_view.spec.project == project: @@ -1169,7 +1221,7 @@ def list_on_demand_feature_views( List of on demand feature views """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) on_demand_feature_views = [] for on_demand_feature_view in registry.on_demand_feature_views: if on_demand_feature_view.spec.project == project: @@ -1193,7 +1245,7 @@ def get_on_demand_feature_view( Returns either the specified on demand feature view, or raises an exception if none is found """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for on_demand_feature_view in registry.on_demand_feature_views: if ( @@ -1217,7 +1269,7 @@ def get_data_source( Returns: Returns either the specified data source, or raises an exception if none is found """ - registry = self._get_registry_proto(allow_cache=allow_cache) + registry = self._get_registry_proto(project=project, allow_cache=allow_cache) for data_source in registry.data_sources: if data_source.project == project and data_source.name == name: @@ -1242,7 +1294,7 @@ def apply_materialization( end_date (datetime): End date of the materialization interval to track commit: Whether the change should be persisted immediately """ - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for 
idx, existing_feature_view_proto in enumerate( @@ -1306,7 +1358,9 @@ def list_feature_views( Returns: List of feature views """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) feature_views: List[FeatureView] = [] for feature_view_proto in registry_proto.feature_views: if feature_view_proto.spec.project == project: @@ -1320,13 +1374,12 @@ def get_request_feature_view(self, name: str, project: str): Args: name: Name of feature view project: Feast project that this feature view belongs to - allow_cache: Allow returning feature view from the cached registry Returns: Returns either the specified feature view, or raises an exception if none is found """ - registry_proto = self._get_registry_proto(allow_cache=False) + registry_proto = self._get_registry_proto(project=project, allow_cache=False) for feature_view_proto in registry_proto.feature_views: if ( feature_view_proto.spec.name == name @@ -1348,7 +1401,9 @@ def list_request_feature_views( Returns: List of feature views """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) feature_views: List[RequestFeatureView] = [] for request_feature_view_proto in registry_proto.request_feature_views: if request_feature_view_proto.spec.project == project: @@ -1372,7 +1427,9 @@ def get_feature_view( Returns either the specified feature view, or raises an exception if none is found """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) for feature_view_proto in registry_proto.feature_views: if ( feature_view_proto.spec.name == name @@ -1396,7 +1453,9 @@ def get_stream_feature_view( Returns either the specified feature view, or raises an exception if none is found """ - registry_proto = 
self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) for feature_view_proto in registry_proto.stream_feature_views: if ( feature_view_proto.spec.name == name @@ -1414,7 +1473,7 @@ def delete_feature_service(self, name: str, project: str, commit: bool = True): project: Feast project that this feature service belongs to commit: Whether the change should be persisted immediately """ - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for idx, feature_service_proto in enumerate( @@ -1439,7 +1498,7 @@ def delete_feature_view(self, name: str, project: str, commit: bool = True): project: Feast project that this feature view belongs to commit: Whether the change should be persisted immediately """ - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for idx, existing_feature_view_proto in enumerate( @@ -1501,7 +1560,7 @@ def delete_entity(self, name: str, project: str, commit: bool = True): project: Feast project that this entity belongs to commit: Whether the change should be persisted immediately """ - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert self.cached_registry_proto for idx, existing_entity_proto in enumerate( @@ -1519,7 +1578,10 @@ def delete_entity(self, name: str, project: str, commit: bool = True): raise EntityNotFoundException(name, project) def apply_saved_dataset( - self, saved_dataset: SavedDataset, project: str, commit: bool = True, + self, + saved_dataset: SavedDataset, + project: str, + commit: bool = True, ): """ Stores a saved dataset metadata with Feast @@ -1536,7 +1598,7 @@ def apply_saved_dataset( saved_dataset_proto = saved_dataset.to_proto() saved_dataset_proto.spec.project = project - self._prepare_registry_for_changes() + self._prepare_registry_for_changes(project) assert 
self.cached_registry_proto for idx, existing_saved_dataset_proto in enumerate( @@ -1568,7 +1630,9 @@ def get_saved_dataset( Returns either the specified SavedDataset, or raises an exception if none is found """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) for saved_dataset in registry_proto.saved_datasets: if ( saved_dataset.spec.name == name @@ -1590,7 +1654,9 @@ def list_saved_datasets( Returns: Returns the list of SavedDatasets """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) return [ SavedDataset.from_proto(saved_dataset) for saved_dataset in registry_proto.saved_datasets @@ -1614,7 +1680,7 @@ def apply_validation_reference( validation_reference_proto = validation_reference.to_proto() validation_reference_proto.project = project - registry_proto = self._prepare_registry_for_changes() + registry_proto = self._prepare_registry_for_changes(project) for idx, existing_validation_reference in enumerate( registry_proto.validation_references ): @@ -1633,18 +1699,20 @@ def get_validation_reference( self, name: str, project: str, allow_cache: bool = False ) -> ValidationReference: """ - Retrieves a validation reference. + Retrieves a validation reference. 
- Args: - name: Name of dataset - project: Feast project that this dataset belongs to - allow_cache: Whether to allow returning this dataset from a cached registry + Args: + name: Name of dataset + project: Feast project that this dataset belongs to + allow_cache: Whether to allow returning this dataset from a cached registry - Returns: - Returns either the specified ValidationReference, or raises an exception if - none is found - """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) + Returns: + Returns either the specified ValidationReference, or raises an exception if + none is found + """ + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) for validation_reference in registry_proto.validation_references: if ( validation_reference.name == name @@ -1662,7 +1730,7 @@ def delete_validation_reference(self, name: str, project: str, commit: bool = Tr project: Feast project that this object belongs to commit: Whether the change should be persisted immediately """ - registry_proto = self._prepare_registry_for_changes() + registry_proto = self._prepare_registry_for_changes(project) for idx, existing_validation_reference in enumerate( registry_proto.validation_references ): @@ -1676,14 +1744,26 @@ def delete_validation_reference(self, name: str, project: str, commit: bool = Tr return raise ValidationReferenceNotFound(name, project=project) + def list_project_metadata( + self, project: str, allow_cache: bool = False + ) -> List[ProjectMetadata]: + registry_proto = self._get_registry_proto( + project=project, allow_cache=allow_cache + ) + return [ + ProjectMetadata.from_proto(project_metadata) + for project_metadata in registry_proto.project_metadata + if project_metadata.project == project + ] + def commit(self): """Commits the state of the registry cache to the remote registry store.""" if self.cached_registry_proto: self._registry_store.update_registry_proto(self.cached_registry_proto) - def refresh(self): + def 
refresh(self, project: Optional[str]): """Refreshes the state of the registry cache by fetching the registry state from the remote registry store.""" - self._get_registry_proto(allow_cache=False) + self._get_registry_proto(project=project, allow_cache=False) def teardown(self): """Tears down (removes) the registry.""" @@ -1692,21 +1772,34 @@ def teardown(self): def proto(self) -> RegistryProto: return self.cached_registry_proto or RegistryProto() - def _prepare_registry_for_changes(self): + def _prepare_registry_for_changes(self, project: str): """Prepares the Registry for changes by refreshing the cache if necessary.""" try: - self._get_registry_proto(allow_cache=True) + self._get_registry_proto(project=project, allow_cache=True) + if _get_project_metadata(self.cached_registry_proto, project) is None: + # Project metadata not initialized yet. Try pulling without cache + self._get_registry_proto(project=project, allow_cache=False) except FileNotFoundError: registry_proto = RegistryProto() registry_proto.registry_schema_version = REGISTRY_SCHEMA_VERSION self.cached_registry_proto = registry_proto self.cached_registry_proto_created = datetime.utcnow() + + # Initialize project metadata if needed + assert self.cached_registry_proto + if _get_project_metadata(self.cached_registry_proto, project) is None: + _init_project_metadata(self.cached_registry_proto, project) + self.commit() + return self.cached_registry_proto - def _get_registry_proto(self, allow_cache: bool = False) -> RegistryProto: + def _get_registry_proto( + self, project: Optional[str], allow_cache: bool = False + ) -> RegistryProto: """Returns the cached or remote registry state Args: + project: Name of the Feast project (optional) allow_cache: Whether to allow the use of the registry cache when fetching the RegistryProto Returns: Returns a RegistryProto object which represents the state of the registry @@ -1727,7 +1820,15 @@ def _get_registry_proto(self, allow_cache: bool = False) -> RegistryProto: ) ) - 
if allow_cache and not expired: + if project: + old_project_metadata = _get_project_metadata( + registry_proto=self.cached_registry_proto, project=project + ) + + if allow_cache and not expired and old_project_metadata is not None: + assert isinstance(self.cached_registry_proto, RegistryProto) + return self.cached_registry_proto + elif allow_cache and not expired: assert isinstance(self.cached_registry_proto, RegistryProto) return self.cached_registry_proto @@ -1735,6 +1836,18 @@ def _get_registry_proto(self, allow_cache: bool = False) -> RegistryProto: self.cached_registry_proto = registry_proto self.cached_registry_proto_created = datetime.utcnow() + if not project: + return registry_proto + + project_metadata = _get_project_metadata( + registry_proto=registry_proto, project=project + ) + if project_metadata: + usage.set_current_project_uuid(project_metadata.project_uuid) + else: + _init_project_metadata(registry_proto, project) + self.commit() + return registry_proto def _check_conflicting_feature_view_names(self, feature_view: BaseFeatureView): diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index b7cf1683dc..587907b284 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -1,5 +1,6 @@ import logging import os +import warnings from pathlib import Path from typing import Any @@ -25,11 +26,18 @@ from feast.importer import import_class from feast.usage import log_exceptions +warnings.simplefilter("once", RuntimeWarning) + _logger = logging.getLogger(__name__) # These dict exists so that: # - existing values for the online store type in featurestore.yaml files continue to work in a backwards compatible way # - first party and third party implementations can use the same class loading code path. 
+BATCH_ENGINE_CLASS_FOR_TYPE = { + "local": "feast.infra.materialization.LocalMaterializationEngine", + "lambda": "feast.infra.materialization.lambda.lambda_engine.LambdaMaterializationEngine", +} + ONLINE_STORE_CLASS_FOR_TYPE = { "sqlite": "feast.infra.online_stores.sqlite.SqliteOnlineStore", "datastore": "feast.infra.online_stores.datastore.DatastoreOnlineStore", @@ -120,6 +128,9 @@ class RepoConfig(FeastBaseModel): _offline_config: Any = Field(alias="offline_store") """ OfflineStoreConfig: Offline store configuration (optional depending on provider) """ + _batch_engine_config: Any = Field(alias="batch_engine") + """ BatchMaterializationEngine: Batch materialization configuration (optional depending on provider)""" + feature_server: Optional[Any] """ FeatureServerConfig: Feature server configuration (optional depending on provider) """ @@ -128,7 +139,22 @@ class RepoConfig(FeastBaseModel): repo_path: Optional[Path] = None + go_feature_serving: Optional[bool] = False + """ If True, use the Go feature server instead of the Python feature server. """ + go_feature_retrieval: Optional[bool] = False + """ If True, use the embedded Go code to retrieve features instead of the Python SDK. """ + + entity_key_serialization_version: StrictInt = 1 + """ Entity key serialization version: This version is used to control what serialization scheme is + used when writing data to the online store. + A value <= 1 uses the serialization scheme used by feast up to Feast 0.22. + A value of 2 uses a newer serialization scheme, supported as of Feast 0.23. + The main difference between the two scheme is that the serialization scheme v1 stored `long` values as `int`s, + which would result in errors trying to serialize a range of values. + v2 fixes this error, but v1 is kept around to ensure backwards compatibility - specifically the ability to read + feature values for entities that have already been written into the online store. 
+ """ def __init__(self, **data: Any): super().__init__(**data) @@ -155,11 +181,30 @@ def __init__(self, **data: Any): elif data["provider"] == "aws": self._online_config = "dynamodb" + self._batch_engine = None + if "batch_engine" in data: + self._batch_engine_config = data["batch_engine"] + elif "batch_engine_config" in data: + self._batch_engine_config = data["batch_engine_config"] + else: + # Defaults to using local in-process materialization engine. + self._batch_engine_config = "local" + if isinstance(self.feature_server, Dict): self.feature_server = get_feature_server_config_from_type( self.feature_server["type"] )(**self.feature_server) + if self.entity_key_serialization_version <= 1: + warnings.warn( + "`entity_key_serialization_version` is either not specified in the feature_store.yaml, " + "or is specified to a value <= 1." + "This serialization version may cause errors when trying to write fields with the `Long` data type" + " into the online store. Specifying `entity_key_serialization_version` to 2 is recommended for" + " new projects. 
", + RuntimeWarning, + ) + def get_registry_config(self): if isinstance(self.registry, str): return RegistryConfig(path=self.registry) @@ -195,6 +240,22 @@ def online_store(self): return self._online_store + @property + def batch_engine(self): + if not self._batch_engine: + if isinstance(self._batch_engine_config, Dict): + self._batch_engine = get_batch_engine_config_from_type( + self._batch_engine_config["type"] + )(**self._batch_engine_config) + elif isinstance(self._batch_engine_config, str): + self._batch_engine = get_batch_engine_config_from_type( + self._batch_engine_config + )() + elif self._batch_engine_config: + self._batch_engine = self._batch_engine + + return self._batch_engine + @root_validator(pre=True) @log_exceptions def _validate_online_store_config(cls, values): @@ -238,7 +299,8 @@ def _validate_online_store_config(cls, values): online_config_class(**values["online_store"]) except ValidationError as e: raise ValidationError( - [ErrorWrapper(e, loc="online_store")], model=RepoConfig, + [ErrorWrapper(e, loc="online_store")], + model=RepoConfig, ) return values @@ -272,7 +334,8 @@ def _validate_offline_store_config(cls, values): offline_config_class(**values["offline_store"]) except ValidationError as e: raise ValidationError( - [ErrorWrapper(e, loc="offline_store")], model=RepoConfig, + [ErrorWrapper(e, loc="offline_store")], + model=RepoConfig, ) return values @@ -306,7 +369,8 @@ def _validate_feature_server_config(cls, values): feature_server_config_class(**values["feature_server"]) except ValidationError as e: raise ValidationError( - [ErrorWrapper(e, loc="feature_server")], model=RepoConfig, + [ErrorWrapper(e, loc="feature_server")], + model=RepoConfig, ) return values @@ -343,7 +407,12 @@ def write_to_path(self, repo_path: Path): config_path = repo_path / "feature_store.yaml" with open(config_path, mode="w") as f: yaml.dump( - yaml.safe_load(self.json(exclude={"repo_path"}, exclude_unset=True,)), + yaml.safe_load( + self.json( + 
exclude={"repo_path"}, + exclude_unset=True, + ) + ), f, sort_keys=False, ) @@ -372,6 +441,17 @@ def get_data_source_class_from_type(data_source_type: str): return import_class(module_name, config_class_name, "DataSource") +def get_batch_engine_config_from_type(batch_engine_type: str): + if batch_engine_type in BATCH_ENGINE_CLASS_FOR_TYPE: + batch_engine_type = BATCH_ENGINE_CLASS_FOR_TYPE[batch_engine_type] + else: + assert batch_engine_type.endswith("Engine") + module_name, batch_engine_class_type = batch_engine_type.rsplit(".", 1) + config_class_name = f"{batch_engine_class_type}Config" + + return import_class(module_name, config_class_name, config_class_name) + + def get_online_config_from_type(online_store_type: str): if online_store_type in ONLINE_STORE_CLASS_FOR_TYPE: online_store_type = ONLINE_STORE_CLASS_FOR_TYPE[online_store_type] diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index 37daa6500e..9a5e64f8c3 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -183,7 +183,7 @@ def plan(repo_config: RepoConfig, repo_path: Path, skip_source_validation: bool) for data_source in data_sources: data_source.validate(store.config) - registry_diff, infra_diff, _ = store._plan(repo) + registry_diff, infra_diff, _ = store.plan(repo) click.echo(registry_diff.to_string()) click.echo(infra_diff.to_string()) @@ -262,7 +262,7 @@ def apply_total_with_repo_instance( for data_source in data_sources: data_source.validate(store.config) - registry_diff, infra_diff, new_infra = store._plan(repo) + registry_diff, infra_diff, new_infra = store.plan(repo) # For each object in the registry, determine whether it should be kept or deleted. 
( diff --git a/sdk/python/feast/repo_upgrade.py b/sdk/python/feast/repo_upgrade.py index 5c8d7433b2..6aa7a2cc1d 100644 --- a/sdk/python/feast/repo_upgrade.py +++ b/sdk/python/feast/repo_upgrade.py @@ -3,6 +3,7 @@ from typing import Dict, List from bowler import Query +from fissix.fixer_util import touch_import from fissix.pgen2 import token from fissix.pygram import python_symbols from fissix.pytree import Node @@ -30,6 +31,39 @@ def __init__(self, repo_path: str, write: bool): def upgrade(self): self.remove_date_partition_column() + self.rename_features_to_schema() + + def rename_inputs_to_sources(self): + def _change_argument_transform(node, capture, filename) -> None: + children = node.children + self.rename_arguments_in_children(children, {"inputs": "sources"}) + + PATTERN = """ + decorator< + any * + "on_demand_feature_view" + any * + > + """ + + Query(self.repo_files).select(PATTERN).modify( + _change_argument_transform + ).execute(write=self.write, interactive=False) + + def rename_features_to_schema(self): + Query(str(self.repo_path)).select_class("Feature").modify( + self.import_remover("Feature") + ).execute(interactive=False, write=self.write) + + def _rename_class_name( + node: Node, capture: Dict[str, Node], filename: str + ) -> None: + self.rename_class_call(node, "Field") + touch_import("feast", "Field", node) + + Query(self.repo_files).select_class("Feature").is_call().modify( + _rename_class_name + ).execute(write=self.write, interactive=False) def remove_date_partition_column(self): def _remove_date_partition_column( @@ -42,6 +76,52 @@ def _remove_date_partition_column( _remove_date_partition_column ).execute(write=self.write, interactive=False) + @staticmethod + def rename_arguments_in_children( + children: List[Node], renames: Dict[str, str] + ) -> None: + """ + Renames the arguments in the children list of a node by searching for the + argument list or trailing list and renaming all keys in `renames` dict to + corresponding value. 
+ """ + for child in children: + if not isinstance(child, Node): + continue + if ( + child.type == python_symbols.arglist + or child.type == python_symbols.trailer + ): + if not child.children: + continue + for _, child in enumerate(child.children): + if not isinstance(child, Node): + continue + else: + if child.type == python_symbols.argument: + if child.children[0].value in renames: + child.children[0].value = renames[ + child.children[0].value + ] + + @staticmethod + def rename_class_call(node: Node, new_class_name: str): + """ + Rename the class being instantiated. + f = Feature( + name="driver_id", + join_key="driver_id", + ) + into + f = Field( + name="driver_id", + ) + This method assumes that node represents a class call that already has an arglist. + """ + if len(node.children) < 2 or len(node.children[1].children) < 2: + raise ValueError(f"Expected a class call with an arglist but got {node}.") + node.children[0].value = new_class_name + @staticmethod def remove_argument_transform(node: Node, argument: str): """ @@ -70,3 +150,26 @@ def remove_argument_transform(node: Node, argument: str): class_args.pop(i) if i < len(class_args) and class_args[i].type == token.NEWLINE: class_args.pop(i) + + @staticmethod + def import_remover(class_name): + def remove_import_transformer(node, capture, filename): + if "class_import" in capture and capture["class_name"].value == class_name: + if capture["class_import"].type == python_symbols.import_from: + import_from_stmt = node.children + imported_classes = import_from_stmt[3] + + if len(imported_classes.children) > 1: + # something of the form `from feast import A, ValueType` + for i, class_leaf in enumerate(imported_classes.children): + if class_leaf.value == class_name: + imported_classes.children.pop(i) + if i == len(imported_classes.children): + imported_classes.children.pop(i - 1) + else: + imported_classes.children.pop(i) + else: + # something of the form `from feast import ValueType` + 
node.parent.children.remove(node) + + return remove_import_transformer diff --git a/sdk/python/feast/stream_feature_view.py b/sdk/python/feast/stream_feature_view.py index 077d8ab89a..29e8abb7da 100644 --- a/sdk/python/feast/stream_feature_view.py +++ b/sdk/python/feast/stream_feature_view.py @@ -6,7 +6,6 @@ from typing import Dict, List, Optional, Tuple, Union import dill -from google.protobuf.duration_pb2 import Duration from typeguard import typechecked from feast import utils @@ -16,18 +15,12 @@ from feast.feature_view import FeatureView from feast.field import Field from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto -from feast.protos.feast.core.FeatureView_pb2 import ( - MaterializationInterval as MaterializationIntervalProto, -) from feast.protos.feast.core.OnDemandFeatureView_pb2 import ( UserDefinedFunction as UserDefinedFunctionProto, ) from feast.protos.feast.core.StreamFeatureView_pb2 import ( StreamFeatureView as StreamFeatureViewProto, ) -from feast.protos.feast.core.StreamFeatureView_pb2 import ( - StreamFeatureViewMeta as StreamFeatureViewMetaProto, -) from feast.protos.feast.core.StreamFeatureView_pb2 import ( StreamFeatureViewSpec as StreamFeatureViewSpecProto, ) @@ -170,23 +163,8 @@ def __hash__(self) -> int: return super().__hash__() def to_proto(self): - meta = StreamFeatureViewMetaProto(materialization_intervals=[]) - if self.created_timestamp: - meta.created_timestamp.FromDatetime(self.created_timestamp) - - if self.last_updated_timestamp: - meta.last_updated_timestamp.FromDatetime(self.last_updated_timestamp) - - for interval in self.materialization_intervals: - interval_proto = MaterializationIntervalProto() - interval_proto.start_time.FromDatetime(interval[0]) - interval_proto.end_time.FromDatetime(interval[1]) - meta.materialization_intervals.append(interval_proto) - - ttl_duration = None - if self.ttl is not None: - ttl_duration = Duration() - ttl_duration.FromTimedelta(self.ttl) + meta = 
self.to_proto_meta() + ttl_duration = self.get_ttl_duration() batch_source_proto = None if self.batch_source: @@ -201,7 +179,8 @@ def to_proto(self): udf_proto = None if self.udf: udf_proto = UserDefinedFunctionProto( - name=self.udf.__name__, body=dill.dumps(self.udf, recurse=True), + name=self.udf.__name__, + body=dill.dumps(self.udf, recurse=True), ) spec = StreamFeatureViewSpecProto( name=self.name, diff --git a/sdk/python/feast/templates/aws/feature_store.yaml b/sdk/python/feast/templates/aws/feature_store.yaml index 27d1c6879f..3745a75347 100644 --- a/sdk/python/feast/templates/aws/feature_store.yaml +++ b/sdk/python/feast/templates/aws/feature_store.yaml @@ -12,3 +12,4 @@ offline_store: user: %REDSHIFT_USER% s3_staging_location: %REDSHIFT_S3_STAGING_LOCATION% iam_role: %REDSHIFT_IAM_ROLE% +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/aws/test.py b/sdk/python/feast/templates/aws/test.py index 07410954f7..3d223e8f26 100644 --- a/sdk/python/feast/templates/aws/test.py +++ b/sdk/python/feast/templates/aws/test.py @@ -54,7 +54,8 @@ def main(): # Retrieve features from the online store (Firestore) online_features = fs.get_online_features( - features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], + features=features, + entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], ).to_dict() print() diff --git a/sdk/python/feast/templates/gcp/driver_repo.py b/sdk/python/feast/templates/gcp/driver_repo.py index acb17d5519..6c904a0fee 100644 --- a/sdk/python/feast/templates/gcp/driver_repo.py +++ b/sdk/python/feast/templates/gcp/driver_repo.py @@ -18,6 +18,7 @@ # Indicates a data source from which feature values can be retrieved. Sources are queried when building training # datasets or materializing features into an online store. 
driver_stats_source = BigQuerySource( + name="driver_hourly_stats_source", # The BigQuery table where features can be found table="feast-oss.demo_data.driver_hourly_stats_2", # The event timestamp is used for point-in-time joins and for ensuring only diff --git a/sdk/python/feast/templates/gcp/feature_store.yaml b/sdk/python/feast/templates/gcp/feature_store.yaml index 14c8d5a94f..74ee729090 100644 --- a/sdk/python/feast/templates/gcp/feature_store.yaml +++ b/sdk/python/feast/templates/gcp/feature_store.yaml @@ -1,3 +1,4 @@ project: my_project registry: data/registry.db -provider: gcp \ No newline at end of file +provider: gcp +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/gcp/test.py b/sdk/python/feast/templates/gcp/test.py index 538334044b..8ff11bda5c 100644 --- a/sdk/python/feast/templates/gcp/test.py +++ b/sdk/python/feast/templates/gcp/test.py @@ -54,7 +54,8 @@ def main(): # Retrieve features from the online store (Firestore) online_features = fs.get_online_features( - features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], + features=features, + entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], ).to_dict() print() diff --git a/sdk/python/feast/templates/hbase/example.py b/sdk/python/feast/templates/hbase/example.py index b34696185b..6845371f1f 100644 --- a/sdk/python/feast/templates/hbase/example.py +++ b/sdk/python/feast/templates/hbase/example.py @@ -9,6 +9,7 @@ # production, you can use your favorite DWH, such as BigQuery. See Feast documentation # for more info. 
driver_hourly_stats = FileSource( + name="driver_hourly_stats_source", path="%PARQUET_PATH%", timestamp_field="event_timestamp", created_timestamp_column="created", diff --git a/sdk/python/feast/templates/hbase/feature_store.yaml b/sdk/python/feast/templates/hbase/feature_store.yaml index 83ce237b71..f99e858f7c 100644 --- a/sdk/python/feast/templates/hbase/feature_store.yaml +++ b/sdk/python/feast/templates/hbase/feature_store.yaml @@ -5,3 +5,4 @@ online_store: type: hbase host: 127.0.0.1 port: 9090 +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/local/example.py b/sdk/python/feast/templates/local/example.py index 30f9adf189..4fd30ba3a1 100644 --- a/sdk/python/feast/templates/local/example.py +++ b/sdk/python/feast/templates/local/example.py @@ -9,6 +9,7 @@ # production, you can use your favorite DWH, such as BigQuery. See Feast documentation # for more info. driver_hourly_stats = FileSource( + name="driver_hourly_stats_source", path="%PARQUET_PATH%", timestamp_field="event_timestamp", created_timestamp_column="created", diff --git a/sdk/python/feast/templates/local/feature_store.yaml b/sdk/python/feast/templates/local/feature_store.yaml index dcbe32d943..fddde04f90 100644 --- a/sdk/python/feast/templates/local/feature_store.yaml +++ b/sdk/python/feast/templates/local/feature_store.yaml @@ -2,4 +2,5 @@ project: my_project registry: data/registry.db provider: local online_store: - path: data/online_store.db \ No newline at end of file + path: data/online_store.db +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/minimal/feature_store.yaml b/sdk/python/feast/templates/minimal/feature_store.yaml index 2083288ad7..9808690005 100644 --- a/sdk/python/feast/templates/minimal/feature_store.yaml +++ b/sdk/python/feast/templates/minimal/feature_store.yaml @@ -2,4 +2,5 @@ project: my_project registry: /path/to/registry.db provider: local online_store: - path: /path/to/online_store.db \ No newline at end of file + path: 
/path/to/online_store.db +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/postgres/driver_repo.py b/sdk/python/feast/templates/postgres/driver_repo.py index 4096943bb7..61e32eb58e 100644 --- a/sdk/python/feast/templates/postgres/driver_repo.py +++ b/sdk/python/feast/templates/postgres/driver_repo.py @@ -6,7 +6,10 @@ ) from feast.types import Float32, Int64 -driver = Entity(name="driver_id", join_keys=["driver_id"],) +driver = Entity( + name="driver_id", + join_keys=["driver_id"], +) driver_stats_source = PostgreSQLSource( diff --git a/sdk/python/feast/templates/postgres/feature_store.yaml b/sdk/python/feast/templates/postgres/feature_store.yaml index 53b86b7064..0ccd4a6d49 100644 --- a/sdk/python/feast/templates/postgres/feature_store.yaml +++ b/sdk/python/feast/templates/postgres/feature_store.yaml @@ -25,3 +25,4 @@ offline_store: db_schema: DB_SCHEMA user: DB_USERNAME password: DB_PASSWORD +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/templates/postgres/test.py b/sdk/python/feast/templates/postgres/test.py index 81ac299698..d547bc8c64 100644 --- a/sdk/python/feast/templates/postgres/test.py +++ b/sdk/python/feast/templates/postgres/test.py @@ -52,7 +52,8 @@ def main(): # Retrieve features from the online store online_features = fs.get_online_features( - features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], + features=features, + entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], ).to_dict() print() diff --git a/sdk/python/feast/templates/snowflake/bootstrap.py b/sdk/python/feast/templates/snowflake/bootstrap.py index 194ba08c08..1663a1fb8b 100644 --- a/sdk/python/feast/templates/snowflake/bootstrap.py +++ b/sdk/python/feast/templates/snowflake/bootstrap.py @@ -13,7 +13,6 @@ def bootstrap(): from feast.driver_test_data import create_driver_hourly_stats_df repo_path = pathlib.Path(__file__).parent.absolute() - config_file = repo_path / "feature_store.yaml" project_name = 
str(repo_path)[str(repo_path).rfind("/") + 1 :] @@ -23,7 +22,6 @@ def bootstrap(): driver_entities = [1001, 1002, 1003, 1004, 1005] driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date) - repo_path = pathlib.Path(__file__).parent.absolute() data_path = repo_path / "data" data_path.mkdir(exist_ok=True) driver_stats_path = data_path / "driver_stats.parquet" @@ -38,6 +36,17 @@ def bootstrap(): snowflake_warehouse = click.prompt("Snowflake Warehouse Name (Case Sensitive):") snowflake_database = click.prompt("Snowflake Database Name (Case Sensitive):") + config_file = repo_path / "feature_store.yaml" + for i in range(2): + replace_str_in_file( + config_file, "SNOWFLAKE_DEPLOYMENT_URL", snowflake_deployment_url + ) + replace_str_in_file(config_file, "SNOWFLAKE_USER", snowflake_user) + replace_str_in_file(config_file, "SNOWFLAKE_PASSWORD", snowflake_password) + replace_str_in_file(config_file, "SNOWFLAKE_ROLE", snowflake_role) + replace_str_in_file(config_file, "SNOWFLAKE_WAREHOUSE", snowflake_warehouse) + replace_str_in_file(config_file, "SNOWFLAKE_DATABASE", snowflake_database) + if click.confirm( f'Should I upload example data to Snowflake (overwriting "{project_name}_feast_driver_hourly_stats" table)?', default=True, @@ -66,20 +75,6 @@ def bootstrap(): ) conn.close() - repo_path = pathlib.Path(__file__).parent.absolute() - config_file = repo_path / "feature_store.yaml" - driver_file = repo_path / "driver_repo.py" - replace_str_in_file( - config_file, "SNOWFLAKE_DEPLOYMENT_URL", snowflake_deployment_url - ) - replace_str_in_file(config_file, "SNOWFLAKE_USER", snowflake_user) - replace_str_in_file(config_file, "SNOWFLAKE_PASSWORD", snowflake_password) - replace_str_in_file(config_file, "SNOWFLAKE_ROLE", snowflake_role) - replace_str_in_file(config_file, "SNOWFLAKE_WAREHOUSE", snowflake_warehouse) - replace_str_in_file(config_file, "SNOWFLAKE_DATABASE", snowflake_database) - - replace_str_in_file(driver_file, "SNOWFLAKE_WAREHOUSE", 
snowflake_warehouse) - def replace_str_in_file(file_path, match_str, sub_str): with open(file_path, "r") as f: diff --git a/sdk/python/feast/templates/snowflake/driver_repo.py b/sdk/python/feast/templates/snowflake/driver_repo.py index 297a3f5ef0..54f6b67126 100644 --- a/sdk/python/feast/templates/snowflake/driver_repo.py +++ b/sdk/python/feast/templates/snowflake/driver_repo.py @@ -2,8 +2,7 @@ import yaml -from feast import Entity, FeatureService, FeatureView, Field, SnowflakeSource -from feast.types import Float32, Int64 +from feast import Entity, FeatureService, FeatureView, SnowflakeSource # Define an entity for the driver. Entities can be thought of as primary keys used to # retrieve features. Entities are also used to join multiple tables/views during the @@ -25,7 +24,6 @@ # The Snowflake table where features can be found database=yaml.safe_load(open("feature_store.yaml"))["offline_store"]["database"], table=f"{project_name}_feast_driver_hourly_stats", - warehouse="SNOWFLAKE_WAREHOUSE", # The event timestamp is used for point-in-time joins and for ensuring only # features within the TTL are returned timestamp_field="event_timestamp", @@ -51,14 +49,6 @@ # amount of historical scanning required for historical feature values # during retrieval ttl=timedelta(weeks=52), - # The list of features defined below act as a schema to both define features - # for both materialization of features into a store, and are used as references - # during retrieval for building a training dataset or serving features - schema=[ - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - Field(name="avg_daily_trips", dtype=Int64), - ], # Batch sources are used to find feature values. 
In the case of this feature # view we will query a source table on Redshift for driver statistics # features diff --git a/sdk/python/feast/templates/snowflake/feature_store.yaml b/sdk/python/feast/templates/snowflake/feature_store.yaml index 9757ea2ead..39f266f89f 100644 --- a/sdk/python/feast/templates/snowflake/feature_store.yaml +++ b/sdk/python/feast/templates/snowflake/feature_store.yaml @@ -9,3 +9,12 @@ offline_store: role: SNOWFLAKE_ROLE warehouse: SNOWFLAKE_WAREHOUSE database: SNOWFLAKE_DATABASE +entity_key_serialization_version: 2 +online_store: + type: snowflake.online + account: SNOWFLAKE_DEPLOYMENT_URL + user: SNOWFLAKE_USER + password: SNOWFLAKE_PASSWORD + role: SNOWFLAKE_ROLE + warehouse: SNOWFLAKE_WAREHOUSE + database: SNOWFLAKE_DATABASE diff --git a/sdk/python/feast/templates/snowflake/test.py b/sdk/python/feast/templates/snowflake/test.py index 32aa6380d5..3c33f6aefd 100644 --- a/sdk/python/feast/templates/snowflake/test.py +++ b/sdk/python/feast/templates/snowflake/test.py @@ -54,7 +54,8 @@ def main(): # Retrieve features from the online store online_features = fs.get_online_features( - features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], + features=features, + entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], ).to_dict() print() diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index d006353118..8ad48f53fc 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -16,8 +16,14 @@ # Entity definitions -driver = Entity(name="driver", description="driver id",) -customer = Entity(name="customer", description="customer id",) +driver = Entity( + name="driver", + description="driver id", +) +customer = Entity( + name="customer", + description="customer id", +) # Sources driver_hourly_stats = SparkSource( diff --git a/sdk/python/feast/templates/spark/feature_store.yaml b/sdk/python/feast/templates/spark/feature_store.yaml 
index 2ea0ddfcc9..91e3ecf472 100644 --- a/sdk/python/feast/templates/spark/feature_store.yaml +++ b/sdk/python/feast/templates/spark/feature_store.yaml @@ -12,3 +12,4 @@ offline_store: spark.sql.session.timeZone: "UTC" online_store: path: data/online_store.db +entity_key_serialization_version: 2 diff --git a/sdk/python/feast/types.py b/sdk/python/feast/types.py index 40c1d62e7d..0ba1725f17 100644 --- a/sdk/python/feast/types.py +++ b/sdk/python/feast/types.py @@ -178,7 +178,9 @@ def __str__(self): } -def from_value_type(value_type: ValueType,) -> FeastType: +def from_value_type( + value_type: ValueType, +) -> FeastType: """ Converts a ValueType enum to a Feast type. diff --git a/sdk/python/feast/ui/package.json b/sdk/python/feast/ui/package.json index 556637aaae..883c19660b 100644 --- a/sdk/python/feast/ui/package.json +++ b/sdk/python/feast/ui/package.json @@ -6,14 +6,14 @@ "@elastic/datemath": "^5.0.3", "@elastic/eui": "^57.0.0", "@emotion/react": "^11.9.0", - "@feast-dev/feast-ui": "^0.20.4", + "@feast-dev/feast-ui": "^0.20.5", "@testing-library/jest-dom": "^5.16.4", "@testing-library/react": "^13.2.0", "@testing-library/user-event": "^13.5.0", "@types/d3": "^7.1.0", "d3": "^7.4.4", "inter-ui": "^3.19.3", - "moment": "^2.29.3", + "moment": "^2.29.4", "prop-types": "^15.8.1", "query-string": "^7.1.1", "react": "^18.1.0", diff --git a/sdk/python/feast/ui/yarn.lock b/sdk/python/feast/ui/yarn.lock index f6301957c8..b44fc5f51a 100644 --- a/sdk/python/feast/ui/yarn.lock +++ b/sdk/python/feast/ui/yarn.lock @@ -1345,10 +1345,10 @@ minimatch "^3.1.2" strip-json-comments "^3.1.1" -"@feast-dev/feast-ui@^0.20.4": - version "0.20.4" - resolved "https://registry.yarnpkg.com/@feast-dev/feast-ui/-/feast-ui-0.20.4.tgz#4b918f8922f3eecd9e3e7323f25ba9cac78a4567" - integrity sha512-KTUhKni7t++G6UwXyPbGWXwWHnTOVTH8ouYCoHXbGorgRL3K4fbq5tCSCJzP9L5FAo+cF1AjVZNRgwzPe6vAgA== +"@feast-dev/feast-ui@^0.20.5": + version "0.20.5" + resolved 
"https://registry.yarnpkg.com/@feast-dev/feast-ui/-/feast-ui-0.20.5.tgz#bb0d6fc81cbd92ca69b779982ab151a8d9cabaee" + integrity sha512-BwMPJSv1MkylHxPnU/2fZX77AC/G4H2DIf+HAj80ZklwB0zbmeZzhXFrVh4xSheevGZFh0L839JeL14WfXPZsA== dependencies: "@elastic/datemath" "^5.0.3" "@elastic/eui" "^55.0.1" @@ -1617,25 +1617,47 @@ "@jridgewell/set-array" "^1.0.0" "@jridgewell/sourcemap-codec" "^1.4.10" +"@jridgewell/gen-mapping@^0.3.0": + version "0.3.2" + resolved "https://registry.yarnpkg.com/@jridgewell/gen-mapping/-/gen-mapping-0.3.2.tgz#c1aedc61e853f2bb9f5dfe6d4442d3b565b253b9" + integrity sha512-mh65xKQAzI6iBcFzwv28KVWSmCkdRBWoOh+bYQGW3+6OZvbbN3TqMGo5hqYxQniRcH9F2VZIoJCm4pa3BPDK/A== + dependencies: + "@jridgewell/set-array" "^1.0.1" + "@jridgewell/sourcemap-codec" "^1.4.10" + "@jridgewell/trace-mapping" "^0.3.9" + "@jridgewell/resolve-uri@^3.0.3": - version "3.0.7" - resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.0.7.tgz#30cd49820a962aff48c8fffc5cd760151fca61fe" - integrity sha512-8cXDaBBHOr2pQ7j77Y6Vp5VDT2sIqWyWQ56TjEq4ih/a4iST3dItRe8Q9fp0rrIl9DoKhWQtUQz/YpOxLkXbNA== + version "3.1.0" + resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz#2203b118c157721addfe69d47b70465463066d78" + integrity sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w== "@jridgewell/set-array@^1.0.0": version "1.1.1" resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.1.1.tgz#36a6acc93987adcf0ba50c66908bd0b70de8afea" integrity sha512-Ct5MqZkLGEXTVmQYbGtx9SVqD2fqwvdubdps5D3djjAkgkKwT918VNOz65pEHFaYTeWcukmJmH5SwsA9Tn2ObQ== +"@jridgewell/set-array@^1.0.1": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.1.2.tgz#7c6cf998d6d20b914c0a55a91ae928ff25965e72" + integrity sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw== + +"@jridgewell/source-map@^0.3.2": + version "0.3.2" + resolved 
"https://registry.yarnpkg.com/@jridgewell/source-map/-/source-map-0.3.2.tgz#f45351aaed4527a298512ec72f81040c998580fb" + integrity sha512-m7O9o2uR8k2ObDysZYzdfhb08VuEml5oWGiosa1VdaPZ/A6QyPkAJuwN0Q1lhULOf6B7MtQmHENS743hWtCrgw== + dependencies: + "@jridgewell/gen-mapping" "^0.3.0" + "@jridgewell/trace-mapping" "^0.3.9" + "@jridgewell/sourcemap-codec@^1.4.10": - version "1.4.13" - resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.13.tgz#b6461fb0c2964356c469e115f504c95ad97ab88c" - integrity sha512-GryiOJmNcWbovBxTfZSF71V/mXbgcV3MewDe3kIMCLyIh5e7SKAeUZs+rMnJ8jkMolZ/4/VsdBmMrw3l+VdZ3w== + version "1.4.14" + resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz#add4c98d341472a289190b424efbdb096991bb24" + integrity sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw== "@jridgewell/trace-mapping@^0.3.9": - version "0.3.12" - resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.12.tgz#ccd8cd83ad894bae98a79eecd6a885b211bfe217" - integrity sha512-6GMdw8fZlZjs9CJONrWeWyjl8zYqbyOMSxS9FABnEw3i+wz99SESjWMWRRIsbIp8HVsMeXggi5b7+a9qO6W1fQ== + version "0.3.14" + resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.14.tgz#b231a081d8f66796e475ad588a1ef473112701ed" + integrity sha512-bJWEfQ9lPTvm3SneWwRFVLzrh6nhjwqw7TUFFBEMzwvg7t7PCDenf2lDwqo4NQXzdpgBXyFgDWnQA+2vkruksQ== dependencies: "@jridgewell/resolve-uri" "^3.0.3" "@jridgewell/sourcemap-codec" "^1.4.10" @@ -7143,10 +7165,10 @@ mkdirp@~0.5.1: dependencies: minimist "^1.2.6" -moment@^2.29.1, moment@^2.29.3: - version "2.29.3" - resolved "https://registry.yarnpkg.com/moment/-/moment-2.29.3.tgz#edd47411c322413999f7a5940d526de183c031f3" - integrity sha512-c6YRvhEo//6T2Jz/vVtYzqBzwvPT95JBQ+smCytzf7c50oMZRsR/a4w88aD34I+/QVSfnoAnSBFPJHItlOMJVw== +moment@^2.29.1, moment@^2.29.4: + version "2.29.4" + resolved 
"https://registry.yarnpkg.com/moment/-/moment-2.29.4.tgz#3dbe052889fe7c1b2ed966fcb3a77328964ef108" + integrity sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w== ms@2.0.0: version "2.0.0" @@ -9228,7 +9250,7 @@ source-map@^0.7.3: resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.7.3.tgz#5302f8169031735226544092e64981f751750383" integrity sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ== -source-map@^0.8.0-beta.0, source-map@~0.8.0-beta.0: +source-map@^0.8.0-beta.0: version "0.8.0-beta.0" resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.8.0-beta.0.tgz#d4c1bb42c3f7ee925f005927ba10709e0d1d1f11" integrity sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA== @@ -9633,13 +9655,13 @@ terser-webpack-plugin@^5.1.3, terser-webpack-plugin@^5.2.5: terser "^5.7.2" terser@^5.0.0, terser@^5.10.0, terser@^5.7.2: - version "5.13.1" - resolved "https://registry.yarnpkg.com/terser/-/terser-5.13.1.tgz#66332cdc5a01b04a224c9fad449fc1a18eaa1799" - integrity sha512-hn4WKOfwnwbYfe48NgrQjqNOH9jzLqRcIfbYytOXCOv46LBfWr9bDS17MQqOi+BWGD0sJK3Sj5NC/gJjiojaoA== + version "5.14.2" + resolved "https://registry.yarnpkg.com/terser/-/terser-5.14.2.tgz#9ac9f22b06994d736174f4091aa368db896f1c10" + integrity sha512-oL0rGeM/WFQCUd0y2QrWxYnq7tfSuKBiqTjRPWrRgB46WD/kiwHwF8T23z78H6Q6kGCuuHcPB+KULHRdxvVGQA== dependencies: + "@jridgewell/source-map" "^0.3.2" acorn "^8.5.0" commander "^2.20.0" - source-map "~0.8.0-beta.0" source-map-support "~0.5.20" test-exclude@^6.0.0: diff --git a/sdk/python/feast/ui_server.py b/sdk/python/feast/ui_server.py index cb275c8f91..4d1fd67dc1 100644 --- a/sdk/python/feast/ui_server.py +++ b/sdk/python/feast/ui_server.py @@ -83,7 +83,9 @@ def catch_all(): return Response(content, media_type="text/html") app.mount( - "/", StaticFiles(directory=ui_dir, html=True), name="site", + "/", + StaticFiles(directory=ui_dir, 
html=True), + name="site", ) return app diff --git a/sdk/python/feast/usage.py b/sdk/python/feast/usage.py index 90b659479d..5e78aa52d2 100644 --- a/sdk/python/feast/usage.py +++ b/sdk/python/feast/usage.py @@ -35,11 +35,12 @@ USAGE_ENDPOINT = "https://usage.feast.dev" _logger = logging.getLogger(__name__) -_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) +_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3) _is_enabled = os.getenv(FEAST_USAGE, default=DEFAULT_FEAST_USAGE_VALUE) == "True" _constant_attributes = { + "project_id": "", "session_id": str(uuid.uuid4()), "installation_id": None, "version": get_version(), @@ -53,6 +54,10 @@ } +def set_current_project_uuid(project_uuid: str): + _constant_attributes["project_id"] = project_uuid + + @dataclasses.dataclass class FnCall: fn_name: str diff --git a/sdk/python/feast/utils.py b/sdk/python/feast/utils.py index a40f423c53..1b99934159 100644 --- a/sdk/python/feast/utils.py +++ b/sdk/python/feast/utils.py @@ -1,8 +1,24 @@ +import typing +from collections import defaultdict from datetime import datetime +from typing import Dict, List, Optional, Tuple, Union +import pandas as pd +import pyarrow +from dask import dataframe as dd from dateutil.tz import tzlocal from pytz import utc +from feast.entity import Entity +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.type_map import python_values_to_proto_values +from feast.value_type import ValueType + +if typing.TYPE_CHECKING: + from feast.feature_view import FeatureView + from feast.on_demand_feature_view import OnDemandFeatureView + def make_tzaware(t: datetime) -> datetime: """We assume tz-naive datetimes are UTC""" @@ -12,6 +28,15 @@ def make_tzaware(t: datetime) -> datetime: return t +def make_df_tzaware(t: pd.DataFrame) -> pd.DataFrame: + """Make all datetime type columns tzaware; leave everything else intact.""" + df = t.copy() # 
don't modify incoming dataframe inplace + for column in df.columns: + if pd.api.types.is_datetime64_any_dtype(df[column]): + df[column] = pd.to_datetime(df[column], utc=True) + return df + + def to_naive_utc(ts: datetime) -> datetime: if ts.tzinfo is None: return ts @@ -24,3 +49,203 @@ def maybe_local_tz(t: datetime) -> datetime: return t.replace(tzinfo=tzlocal()) else: return t + + +def _get_requested_feature_views_to_features_dict( + feature_refs: List[str], + feature_views: List["FeatureView"], + on_demand_feature_views: List["OnDemandFeatureView"], +) -> Tuple[Dict["FeatureView", List[str]], Dict["OnDemandFeatureView", List[str]]]: + """Create a dict of FeatureView -> List[Feature] for all requested features. + Set full_feature_names to True to have feature names prefixed by their feature view name.""" + + feature_views_to_feature_map: Dict["FeatureView", List[str]] = defaultdict(list) + on_demand_feature_views_to_feature_map: Dict[ + "OnDemandFeatureView", List[str] + ] = defaultdict(list) + + for ref in feature_refs: + ref_parts = ref.split(":") + feature_view_from_ref = ref_parts[0] + feature_from_ref = ref_parts[1] + + found = False + for fv in feature_views: + if fv.projection.name_to_use() == feature_view_from_ref: + found = True + feature_views_to_feature_map[fv].append(feature_from_ref) + for odfv in on_demand_feature_views: + if odfv.projection.name_to_use() == feature_view_from_ref: + found = True + on_demand_feature_views_to_feature_map[odfv].append(feature_from_ref) + + if not found: + raise ValueError(f"Could not find feature view from reference {ref}") + + return feature_views_to_feature_map, on_demand_feature_views_to_feature_map + + +def _get_column_names( + feature_view: "FeatureView", entities: List[Entity] +) -> Tuple[List[str], List[str], str, Optional[str]]: + """ + If a field mapping exists, run it in reverse on the join keys, + feature names, event timestamp column, and created timestamp column + to get the names of the relevant columns 
in the offline feature store table. + + Returns: + Tuple containing the list of reverse-mapped join_keys, + reverse-mapped feature names, reverse-mapped event timestamp column, + and reverse-mapped created timestamp column that will be passed into + the query to the offline store. + """ + # if we have mapped fields, use the original field names in the call to the offline store + timestamp_field = feature_view.batch_source.timestamp_field + feature_names = [feature.name for feature in feature_view.features] + created_timestamp_column = feature_view.batch_source.created_timestamp_column + + from feast.feature_view import DUMMY_ENTITY_ID + + join_keys = [ + entity.join_key for entity in entities if entity.join_key != DUMMY_ENTITY_ID + ] + if feature_view.batch_source.field_mapping is not None: + reverse_field_mapping = { + v: k for k, v in feature_view.batch_source.field_mapping.items() + } + timestamp_field = ( + reverse_field_mapping[timestamp_field] + if timestamp_field in reverse_field_mapping.keys() + else timestamp_field + ) + created_timestamp_column = ( + reverse_field_mapping[created_timestamp_column] + if created_timestamp_column + and created_timestamp_column in reverse_field_mapping.keys() + else created_timestamp_column + ) + join_keys = [ + reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col + for col in join_keys + ] + feature_names = [ + reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col + for col in feature_names + ] + + # We need to exclude join keys and timestamp columns from the list of features, after they are mapped to + # their final column names via the `field_mapping` field of the source. 
+ feature_names = [ + name + for name in feature_names + if name not in join_keys + and name != timestamp_field + and name != created_timestamp_column + ] + return ( + join_keys, + feature_names, + timestamp_field, + created_timestamp_column, + ) + + +def _run_pyarrow_field_mapping( + table: pyarrow.Table, + field_mapping: Dict[str, str], +) -> pyarrow.Table: + # run field mapping in the forward direction + cols = table.column_names + mapped_cols = [ + field_mapping[col] if col in field_mapping.keys() else col for col in cols + ] + table = table.rename_columns(mapped_cols) + return table + + +def _run_dask_field_mapping( + table: dd.DataFrame, + field_mapping: Dict[str, str], +): + if field_mapping: + # run field mapping in the forward direction + table = table.rename(columns=field_mapping) + table = table.persist() + + return table + + +def _coerce_datetime(ts): + """ + Depending on underlying time resolution, arrow to_pydict() sometimes returns pd + timestamp type (for nanosecond resolution), and sometimes you get standard python datetime + (for microsecond resolution). + While pd timestamp class is a subclass of python datetime, it doesn't always behave the + same way. We convert it to normal datetime so that consumers downstream don't have to deal + with these quirks. + """ + if isinstance(ts, pd.Timestamp): + return ts.to_pydatetime() + else: + return ts + + +def _convert_arrow_to_proto( + table: Union[pyarrow.Table, pyarrow.RecordBatch], + feature_view: "FeatureView", + join_keys: Dict[str, ValueType], +) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]: + # Avoid ChunkedArrays which guarantees `zero_copy_only` available. 
+ if isinstance(table, pyarrow.Table): + table = table.to_batches()[0] + + columns = [ + (field.name, field.dtype.to_value_type()) for field in feature_view.features + ] + list(join_keys.items()) + + proto_values_by_column = { + column: python_values_to_proto_values( + table.column(column).to_numpy(zero_copy_only=False), value_type + ) + for column, value_type in columns + } + + entity_keys = [ + EntityKeyProto( + join_keys=join_keys, + entity_values=[proto_values_by_column[k][idx] for k in join_keys], + ) + for idx in range(table.num_rows) + ] + + # Serialize the features per row + feature_dict = { + feature.name: proto_values_by_column[feature.name] + for feature in feature_view.features + } + features = [dict(zip(feature_dict, vars)) for vars in zip(*feature_dict.values())] + + # Convert event_timestamps + event_timestamps = [ + _coerce_datetime(val) + for val in pd.to_datetime( + table.column(feature_view.batch_source.timestamp_field).to_numpy( + zero_copy_only=False + ) + ) + ] + + # Convert created_timestamps if they exist + if feature_view.batch_source.created_timestamp_column: + created_timestamps = [ + _coerce_datetime(val) + for val in pd.to_datetime( + table.column( + feature_view.batch_source.created_timestamp_column + ).to_numpy(zero_copy_only=False) + ) + ] + else: + created_timestamps = [None] * table.num_rows + + return list(zip(entity_keys, features, event_timestamps, created_timestamps)) diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index 3bdf468bb2..4ff99c247f 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --extra=ci --output-file=sdk/python/requirements/py3.10-ci-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata adal==1.2.7 # via @@ -33,9 +33,7 @@ anyio==3.6.1 # starlette # watchfiles appdirs==1.4.4 - # via - # black - # fissix + # via fissix 
appnope==0.1.3 # via ipython asn1crypto==1.5.1 @@ -50,16 +48,15 @@ async-timeout==4.0.2 # via # aiohttp # redis -attrs==21.4.0 +attrs==22.1.0 # via # aiohttp - # black # bowler # jsonschema # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.1 +azure-core==1.24.2 # via # adlfs # azure-identity @@ -69,13 +66,13 @@ azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 # via adlfs -azure-storage-blob==12.12.0 +azure-storage-blob==12.13.0 # via adlfs babel==2.10.3 # via sphinx backcall==0.2.0 # via ipython -black==19.10b0 +black==22.6.0 # via feast (setup.py) boto3==1.20.23 # via @@ -90,10 +87,12 @@ botocore==1.23.24 bowler==0.9.0 # via feast (setup.py) build==0.8.0 - # via feast (setup.py) + # via + # feast (setup.py) + # pip-tools cachecontrol==0.12.11 # via firebase-admin -cachetools==4.2.4 +cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via @@ -101,7 +100,7 @@ certifi==2022.6.15 # msrest # requests # snowflake-connector-python -cffi==1.15.0 +cffi==1.15.1 # via # azure-datalake-store # cryptography @@ -113,7 +112,7 @@ charset-normalizer==2.0.12 # aiohttp # requests # snowflake-connector-python -click==8.0.1 +click==8.1.3 # via # black # bowler @@ -128,7 +127,7 @@ colorama==0.4.5 # via # feast (setup.py) # great-expectations -coverage[toml]==6.4.1 +coverage[toml]==6.4.2 # via pytest-cov cryptography==35.0.0 # via @@ -145,6 +144,8 @@ dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations +db-dtypes==1.0.2 + # via google-cloud-bigquery decorator==5.1.1 # via # gcsfs @@ -155,7 +156,7 @@ deprecation==2.1.0 # via testcontainers dill==0.3.5.1 # via feast (setup.py) -distlib==0.3.4 +distlib==0.3.5 # via virtualenv docker==5.0.3 # via @@ -169,23 +170,23 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.8.3 +executing==0.9.1 # via stack-data -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 +fastavro==1.5.4 # via # feast (setup.py) # pandavro -fastjsonschema==2.15.3 
+fastjsonschema==2.16.1 # via nbformat filelock==3.7.1 # via virtualenv -firebase-admin==4.5.2 +firebase-admin==5.2.0 # via feast (setup.py) fissix==21.11.13 # via bowler -flake8==4.0.1 +flake8==5.0.2 # via feast (setup.py) frozenlist==1.3.0 # via @@ -199,7 +200,7 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) -google-api-core[grpc]==1.31.6 +google-api-core[grpc]==2.8.2 # via # feast (setup.py) # firebase-admin @@ -209,9 +210,10 @@ google-api-core[grpc]==1.31.6 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.51.0 + # google-cloud-storage +google-api-python-client==2.55.0 # via firebase-admin -google-auth==1.35.0 +google-auth==2.9.1 # via # gcsfs # google-api-core @@ -224,54 +226,57 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery==2.34.4 - # via feast (setup.py) -google-cloud-bigquery-storage==2.13.2 +google-cloud-bigquery[pandas]==3.3.0 # via feast (setup.py) -google-cloud-core==1.7.2 +google-cloud-bigquery-storage==2.14.1 # via # feast (setup.py) # google-cloud-bigquery +google-cloud-core==2.3.2 + # via + # google-cloud-bigquery # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.7.1 +google-cloud-datastore==2.8.0 # via feast (setup.py) -google-cloud-firestore==2.5.3 +google-cloud-firestore==2.6.0 # via firebase-admin -google-cloud-storage==1.40.0 +google-cloud-storage==2.4.0 # via # feast (setup.py) # firebase-admin # gcsfs google-crc32c==1.3.0 # via google-resumable-media -google-resumable-media==1.3.3 +google-resumable-media==2.3.3 # via # google-cloud-bigquery # google-cloud-storage -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core + # grpcio-status # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) # 
google-api-core # google-cloud-bigquery # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools grpcio-reflection==1.47.0 # via feast (setup.py) -grpcio-testing==1.44.0 +grpcio-status==1.47.0 + # via google-api-core +grpcio-testing==1.47.0 # via feast (setup.py) -grpcio-tools==1.44.0 +grpcio-tools==1.47.0 # via feast (setup.py) h11==0.13.0 # via uvicorn @@ -285,7 +290,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.1 +identify==2.5.2 # via pre-commit idna==3.3 # via @@ -293,9 +298,9 @@ idna==3.3 # requests # snowflake-connector-python # yarl -imagesize==1.3.0 +imagesize==1.4.1 # via sphinx -importlib-metadata==4.11.4 +importlib-metadata==4.12.0 # via great-expectations iniconfig==1.1.1 # via pytest @@ -322,13 +327,13 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.6.0 +jsonschema==4.9.0 # via # altair # feast (setup.py) # great-expectations # nbformat -jupyter-core==4.10.0 +jupyter-core==4.11.1 # via nbformat locket==1.0.0 # via partd @@ -336,13 +341,14 @@ markupsafe==2.1.1 # via # jinja2 # moto + # werkzeug matplotlib-inline==0.1.3 # via ipython -mccabe==0.6.1 +mccabe==0.7.0 # via flake8 minio==7.1.0 # via feast (setup.py) -mistune==2.0.2 +mistune==2.0.4 # via great-expectations mmh3==3.0.0 # via feast (setup.py) @@ -350,7 +356,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.14 +moto==3.1.16 # via feast (setup.py) msal==1.18.0 # via @@ -370,23 +376,26 @@ multidict==6.0.2 # via # aiohttp # yarl -mypy==0.961 +mypy==0.971 # via # feast (setup.py) # sqlalchemy mypy-extensions==0.4.3 - # via mypy + # via + # black + # mypy mypy-protobuf==3.1 # via feast (setup.py) mysqlclient==2.1.1 # via feast (setup.py) nbformat==5.4.0 # via great-expectations -nodeenv==1.6.0 +nodeenv==1.7.0 # via pre-commit -numpy==1.21.6 +numpy==1.23.1 # via # altair + # db-dtypes # feast (setup.py) # great-expectations # pandas @@ -401,8 +410,8 @@ packaging==21.3 # 
via # build # dask + # db-dtypes # deprecation - # google-api-core # google-cloud-bigquery # great-expectations # pytest @@ -411,7 +420,9 @@ packaging==21.3 pandas==1.4.3 # via # altair + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # great-expectations # pandavro # snowflake-connector-python @@ -425,27 +436,27 @@ pathspec==0.9.0 # via black pbr==5.9.0 # via mock -pep517==0.12.0 - # via - # build - # pip-tools +pep517==0.13.0 + # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.6.2 +pip-tools==6.8.0 # via feast (setup.py) platformdirs==2.5.2 - # via virtualenv + # via + # black + # virtualenv pluggy==1.0.0 # via pytest ply==3.11 # via thriftpy2 -portalocker==2.4.0 +portalocker==2.5.1 # via msal-extensions -pre-commit==2.19.0 +pre-commit==2.20.0 # via feast (setup.py) -prompt-toolkit==3.0.29 +prompt-toolkit==3.0.30 # via ipython proto-plus==1.20.6 # via @@ -454,7 +465,7 @@ proto-plus==1.20.6 # google-cloud-bigquery-storage # google-cloud-datastore # google-cloud-firestore -protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # google-api-core @@ -464,6 +475,7 @@ protobuf==3.19.4 # google-cloud-firestore # googleapis-common-protos # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools # mypy-protobuf @@ -487,7 +499,9 @@ py4j==0.10.9.5 # via pyspark pyarrow==6.0.1 # via + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # snowflake-connector-python pyasn1==0.4.8 # via @@ -497,7 +511,7 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.8.0 +pycodestyle==2.9.0 # via flake8 pycparser==2.21 # via cffi @@ -507,7 +521,7 @@ pydantic==1.9.1 # via # fastapi # feast (setup.py) -pyflakes==2.4.0 +pyflakes==2.5.0 # via flake8 pygments==2.12.0 # via @@ -570,7 +584,6 @@ python-dotenv==0.20.0 pytz==2022.1 # via # babel - # google-api-core # great-expectations # moto # pandas @@ -586,9 +599,7 @@ pyyaml==6.0 # uvicorn redis==4.2.2 # via feast (setup.py) 
-regex==2022.6.2 - # via black -requests==2.28.0 +requests==2.28.1 # via # adal # adlfs @@ -615,7 +626,7 @@ requests-oauthlib==1.3.1 # msrest responses==0.21.0 # via moto -rsa==4.8 +rsa==4.9 # via google-auth ruamel-yaml==0.17.17 # via great-expectations @@ -623,24 +634,20 @@ s3fs==2022.1.0 # via feast (setup.py) s3transfer==0.5.2 # via boto3 -scipy==1.8.1 +scipy==1.9.0 # via great-expectations six==1.16.0 # via # azure-core # azure-identity - # google-api-core # google-auth # google-auth-httplib2 - # google-cloud-core - # google-resumable-media # grpcio # happybase # mock # msrestazure # pandavro # python-dateutil - # virtualenv sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 @@ -665,7 +672,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -687,17 +694,17 @@ thriftpy2==0.4.14 # via happybase toml==0.10.2 # via - # black # feast (setup.py) # pre-commit tomli==2.0.1 # via + # black # build # coverage # mypy # pep517 # pytest -toolz==0.11.2 +toolz==0.12.0 # via # altair # dask @@ -712,33 +719,31 @@ traitlets==5.3.0 # jupyter-core # matplotlib-inline # nbformat -trino==0.313.0 +trino==0.315.0 # via feast (setup.py) -typed-ast==1.5.4 - # via black typeguard==2.13.3 # via feast (setup.py) types-protobuf==3.19.22 # via # feast (setup.py) # mypy-protobuf -types-python-dateutil==2.8.18 +types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.0 +types-pytz==2022.1.2 # via feast (setup.py) -types-pyyaml==6.0.8 +types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.2 +types-redis==4.3.13 # via feast (setup.py) -types-requests==2.27.31 +types-requests==2.28.6 # via feast (setup.py) -types-setuptools==57.4.17 +types-setuptools==63.2.2 # via feast (setup.py) -types-tabulate==0.8.10 +types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.15 +types-urllib3==1.26.20 # via 
types-requests -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # azure-core # great-expectations @@ -751,7 +756,7 @@ tzlocal==4.2 # via great-expectations uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.9 +urllib3==1.26.11 # via # botocore # feast (setup.py) @@ -759,15 +764,15 @@ urllib3==1.26.9 # minio # requests # responses -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.14.1 +virtualenv==20.16.2 # via pre-commit volatile==2.1.0 # via bowler -watchfiles==0.15.0 +watchfiles==0.16.1 # via uvicorn wcwidth==0.2.5 # via prompt-toolkit @@ -775,7 +780,7 @@ websocket-client==1.3.3 # via docker websockets==10.3 # via uvicorn -werkzeug==2.1.2 +werkzeug==2.2.1 # via moto wheel==0.37.1 # via pip-tools @@ -786,9 +791,9 @@ wrapt==1.14.1 # testcontainers xmltodict==0.13.0 # via moto -yarl==1.7.2 +yarl==1.8.0 # via aiohttp -zipp==3.8.0 +zipp==3.8.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/sdk/python/requirements/py3.10-requirements.txt b/sdk/python/requirements/py3.10-requirements.txt index 15ee46aff5..8ae219f1fe 100644 --- a/sdk/python/requirements/py3.10-requirements.txt +++ b/sdk/python/requirements/py3.10-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=sdk/python/requirements/py3.10-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata anyio==3.6.1 # via @@ -12,7 +12,7 @@ anyio==3.6.1 # watchfiles appdirs==1.4.4 # via fissix -attrs==21.4.0 +attrs==22.1.0 # via # bowler # jsonschema @@ -22,9 +22,9 @@ cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via requests -charset-normalizer==2.0.12 +charset-normalizer==2.1.0 # via requests -click==8.0.1 +click==8.1.3 # via # bowler # feast (setup.py) @@ -38,27 +38,25 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 
+fastavro==1.5.4 # via # feast (setup.py) # pandavro fissix==21.11.13 # via bowler -fsspec==2022.5.0 +fsspec==2022.7.1 # via dask google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.8.0 +google-auth==2.9.1 # via google-api-core -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -75,7 +73,7 @@ idna==3.3 # requests jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.6.0 +jsonschema==4.9.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -85,11 +83,11 @@ mmh3==3.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -mypy==0.961 +mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.21.6 +numpy==1.23.1 # via # feast (setup.py) # pandas @@ -107,7 +105,7 @@ partd==1.2.0 # via dask proto-plus==1.20.6 # via feast (setup.py) -protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # google-api-core @@ -115,7 +113,7 @@ protobuf==3.19.4 # grpcio-reflection # proto-plus # tensorflow-metadata -pyarrow==6.0.1 +pyarrow==8.0.0 # via feast (setup.py) pyasn1==0.4.8 # via @@ -144,9 +142,9 @@ pyyaml==6.0 # dask # feast (setup.py) # uvicorn -requests==2.28.0 +requests==2.28.1 # via google-api-core -rsa==4.8 +rsa==4.9 # via google-auth six==1.16.0 # via @@ -156,7 +154,7 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -172,7 +170,7 @@ toml==0.10.2 # via feast (setup.py) tomli==2.0.1 # via mypy -toolz==0.11.2 +toolz==0.12.0 # via # dask # partd @@ -180,20 +178,20 @@ tqdm==4.64.0 # via feast (setup.py) typeguard==2.13.3 # via feast (setup.py) -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # mypy # pydantic # sqlalchemy2-stubs -urllib3==1.26.9 +urllib3==1.26.11 # via requests -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast 
(setup.py) uvloop==0.16.0 # via uvicorn volatile==2.1.0 # via bowler -watchfiles==0.15.0 +watchfiles==0.16.1 # via uvicorn websockets==10.3 # via uvicorn diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index dd21fae0a2..931a7d1e24 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --extra=ci --output-file=sdk/python/requirements/py3.8-ci-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata adal==1.2.7 # via @@ -33,9 +33,7 @@ anyio==3.6.1 # starlette # watchfiles appdirs==1.4.4 - # via - # black - # fissix + # via fissix appnope==0.1.3 # via ipython asn1crypto==1.5.1 @@ -50,16 +48,15 @@ async-timeout==4.0.2 # via # aiohttp # redis -attrs==21.4.0 +attrs==22.1.0 # via # aiohttp - # black # bowler # jsonschema # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.1 +azure-core==1.24.2 # via # adlfs # azure-identity @@ -69,7 +66,7 @@ azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 # via adlfs -azure-storage-blob==12.12.0 +azure-storage-blob==12.13.0 # via adlfs babel==2.10.3 # via sphinx @@ -79,7 +76,7 @@ backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal -black==19.10b0 +black==22.6.0 # via feast (setup.py) boto3==1.20.23 # via @@ -94,10 +91,12 @@ botocore==1.23.24 bowler==0.9.0 # via feast (setup.py) build==0.8.0 - # via feast (setup.py) + # via + # feast (setup.py) + # pip-tools cachecontrol==0.12.11 # via firebase-admin -cachetools==4.2.4 +cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via @@ -105,7 +104,7 @@ certifi==2022.6.15 # msrest # requests # snowflake-connector-python -cffi==1.15.0 +cffi==1.15.1 # via # azure-datalake-store # cryptography @@ -117,7 +116,7 @@ charset-normalizer==2.0.12 # aiohttp # requests # snowflake-connector-python -click==8.0.1 +click==8.1.3 # via # black # bowler @@ -132,7 +131,7 @@ colorama==0.4.5 # 
via # feast (setup.py) # great-expectations -coverage[toml]==6.4.1 +coverage[toml]==6.4.2 # via pytest-cov cryptography==35.0.0 # via @@ -149,6 +148,8 @@ dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations +db-dtypes==1.0.2 + # via google-cloud-bigquery decorator==5.1.1 # via # gcsfs @@ -159,7 +160,7 @@ deprecation==2.1.0 # via testcontainers dill==0.3.5.1 # via feast (setup.py) -distlib==0.3.4 +distlib==0.3.5 # via virtualenv docker==5.0.3 # via @@ -173,23 +174,23 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.8.3 +executing==0.9.1 # via stack-data -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 +fastavro==1.5.4 # via # feast (setup.py) # pandavro -fastjsonschema==2.15.3 +fastjsonschema==2.16.1 # via nbformat filelock==3.7.1 # via virtualenv -firebase-admin==4.5.2 +firebase-admin==5.2.0 # via feast (setup.py) fissix==21.11.13 # via bowler -flake8==4.0.1 +flake8==5.0.2 # via feast (setup.py) frozenlist==1.3.0 # via @@ -203,7 +204,7 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) -google-api-core[grpc]==1.31.6 +google-api-core[grpc]==2.8.2 # via # feast (setup.py) # firebase-admin @@ -213,9 +214,10 @@ google-api-core[grpc]==1.31.6 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.51.0 + # google-cloud-storage +google-api-python-client==2.55.0 # via firebase-admin -google-auth==1.35.0 +google-auth==2.9.1 # via # gcsfs # google-api-core @@ -228,54 +230,57 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery==2.34.4 - # via feast (setup.py) -google-cloud-bigquery-storage==2.13.2 +google-cloud-bigquery[pandas]==3.3.0 # via feast (setup.py) -google-cloud-core==1.7.2 +google-cloud-bigquery-storage==2.14.1 # via # feast (setup.py) # google-cloud-bigquery +google-cloud-core==2.3.2 + # via + # google-cloud-bigquery # google-cloud-datastore # 
google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.7.1 +google-cloud-datastore==2.8.0 # via feast (setup.py) -google-cloud-firestore==2.5.3 +google-cloud-firestore==2.6.0 # via firebase-admin -google-cloud-storage==1.40.0 +google-cloud-storage==2.4.0 # via # feast (setup.py) # firebase-admin # gcsfs google-crc32c==1.3.0 # via google-resumable-media -google-resumable-media==1.3.3 +google-resumable-media==2.3.3 # via # google-cloud-bigquery # google-cloud-storage -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core + # grpcio-status # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) # google-api-core # google-cloud-bigquery # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools grpcio-reflection==1.47.0 # via feast (setup.py) -grpcio-testing==1.44.0 +grpcio-status==1.47.0 + # via google-api-core +grpcio-testing==1.47.0 # via feast (setup.py) -grpcio-tools==1.44.0 +grpcio-tools==1.47.0 # via feast (setup.py) h11==0.13.0 # via uvicorn @@ -289,7 +294,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.1 +identify==2.5.2 # via pre-commit idna==3.3 # via @@ -297,11 +302,11 @@ idna==3.3 # requests # snowflake-connector-python # yarl -imagesize==1.3.0 +imagesize==1.4.1 # via sphinx -importlib-metadata==4.11.4 +importlib-metadata==4.12.0 # via great-expectations -importlib-resources==5.8.0 +importlib-resources==5.9.0 # via jsonschema iniconfig==1.1.1 # via pytest @@ -328,13 +333,13 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.6.0 +jsonschema==4.9.0 # via # altair # feast (setup.py) # great-expectations # nbformat -jupyter-core==4.10.0 +jupyter-core==4.11.1 # via nbformat locket==1.0.0 # via partd @@ -342,13 +347,14 @@ markupsafe==2.1.1 # via # jinja2 # moto + # werkzeug matplotlib-inline==0.1.3 # via 
ipython -mccabe==0.6.1 +mccabe==0.7.0 # via flake8 minio==7.1.0 # via feast (setup.py) -mistune==2.0.2 +mistune==2.0.4 # via great-expectations mmh3==3.0.0 # via feast (setup.py) @@ -356,7 +362,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.14 +moto==3.1.16 # via feast (setup.py) msal==1.18.0 # via @@ -376,23 +382,26 @@ multidict==6.0.2 # via # aiohttp # yarl -mypy==0.961 +mypy==0.971 # via # feast (setup.py) # sqlalchemy mypy-extensions==0.4.3 - # via mypy + # via + # black + # mypy mypy-protobuf==3.1 # via feast (setup.py) mysqlclient==2.1.1 # via feast (setup.py) nbformat==5.4.0 # via great-expectations -nodeenv==1.6.0 +nodeenv==1.7.0 # via pre-commit -numpy==1.21.6 +numpy==1.23.1 # via # altair + # db-dtypes # feast (setup.py) # great-expectations # pandas @@ -407,8 +416,8 @@ packaging==21.3 # via # build # dask + # db-dtypes # deprecation - # google-api-core # google-cloud-bigquery # great-expectations # pytest @@ -417,7 +426,9 @@ packaging==21.3 pandas==1.4.3 # via # altair + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # great-expectations # pandavro # snowflake-connector-python @@ -431,27 +442,29 @@ pathspec==0.9.0 # via black pbr==5.9.0 # via mock -pep517==0.12.0 - # via - # build - # pip-tools +pep517==0.13.0 + # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.6.2 +pip-tools==6.8.0 # via feast (setup.py) +pkgutil-resolve-name==1.3.10 + # via jsonschema platformdirs==2.5.2 - # via virtualenv + # via + # black + # virtualenv pluggy==1.0.0 # via pytest ply==3.11 # via thriftpy2 -portalocker==2.4.0 +portalocker==2.5.1 # via msal-extensions -pre-commit==2.19.0 +pre-commit==2.20.0 # via feast (setup.py) -prompt-toolkit==3.0.29 +prompt-toolkit==3.0.30 # via ipython proto-plus==1.20.6 # via @@ -460,7 +473,7 @@ proto-plus==1.20.6 # google-cloud-bigquery-storage # google-cloud-datastore # google-cloud-firestore -protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # 
google-api-core @@ -470,6 +483,7 @@ protobuf==3.19.4 # google-cloud-firestore # googleapis-common-protos # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools # mypy-protobuf @@ -493,7 +507,9 @@ py4j==0.10.9.5 # via pyspark pyarrow==6.0.1 # via + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # snowflake-connector-python pyasn1==0.4.8 # via @@ -503,7 +519,7 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.8.0 +pycodestyle==2.9.0 # via flake8 pycparser==2.21 # via cffi @@ -513,7 +529,7 @@ pydantic==1.9.1 # via # fastapi # feast (setup.py) -pyflakes==2.4.0 +pyflakes==2.5.0 # via flake8 pygments==2.12.0 # via @@ -576,7 +592,6 @@ python-dotenv==0.20.0 pytz==2022.1 # via # babel - # google-api-core # great-expectations # moto # pandas @@ -592,9 +607,7 @@ pyyaml==6.0 # uvicorn redis==4.2.2 # via feast (setup.py) -regex==2022.6.2 - # via black -requests==2.28.0 +requests==2.28.1 # via # adal # adlfs @@ -621,7 +634,7 @@ requests-oauthlib==1.3.1 # msrest responses==0.21.0 # via moto -rsa==4.8 +rsa==4.9 # via google-auth ruamel-yaml==0.17.17 # via great-expectations @@ -631,24 +644,20 @@ s3fs==2022.1.0 # via feast (setup.py) s3transfer==0.5.2 # via boto3 -scipy==1.8.1 +scipy==1.9.0 # via great-expectations six==1.16.0 # via # azure-core # azure-identity - # google-api-core # google-auth # google-auth-httplib2 - # google-cloud-core - # google-resumable-media # grpcio # happybase # mock # msrestazure # pandavro # python-dateutil - # virtualenv sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 @@ -673,7 +682,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -695,17 +704,17 @@ thriftpy2==0.4.14 # via happybase toml==0.10.2 # via - # black # feast (setup.py) # pre-commit tomli==2.0.1 # via + # black # build # coverage # mypy # pep517 # 
pytest -toolz==0.11.2 +toolz==0.12.0 # via # altair # dask @@ -720,36 +729,35 @@ traitlets==5.3.0 # jupyter-core # matplotlib-inline # nbformat -trino==0.313.0 +trino==0.315.0 # via feast (setup.py) -typed-ast==1.5.4 - # via black typeguard==2.13.3 # via feast (setup.py) types-protobuf==3.19.22 # via # feast (setup.py) # mypy-protobuf -types-python-dateutil==2.8.18 +types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.0 +types-pytz==2022.1.2 # via feast (setup.py) -types-pyyaml==6.0.8 +types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.2 +types-redis==4.3.13 # via feast (setup.py) -types-requests==2.27.31 +types-requests==2.28.6 # via feast (setup.py) -types-setuptools==57.4.17 +types-setuptools==63.2.2 # via feast (setup.py) -types-tabulate==0.8.10 +types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.15 +types-urllib3==1.26.20 # via types-requests -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # aioitertools # azure-core + # black # great-expectations # mypy # pydantic @@ -761,7 +769,7 @@ tzlocal==4.2 # via great-expectations uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.9 +urllib3==1.26.11 # via # botocore # feast (setup.py) @@ -769,15 +777,15 @@ urllib3==1.26.9 # minio # requests # responses -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.14.1 +virtualenv==20.16.2 # via pre-commit volatile==2.1.0 # via bowler -watchfiles==0.15.0 +watchfiles==0.16.1 # via uvicorn wcwidth==0.2.5 # via prompt-toolkit @@ -785,7 +793,7 @@ websocket-client==1.3.3 # via docker websockets==10.3 # via uvicorn -werkzeug==2.1.2 +werkzeug==2.2.1 # via moto wheel==0.37.1 # via pip-tools @@ -796,9 +804,9 @@ wrapt==1.14.1 # testcontainers xmltodict==0.13.0 # via moto -yarl==1.7.2 +yarl==1.8.0 # via aiohttp -zipp==3.8.0 +zipp==3.8.1 # via # importlib-metadata # importlib-resources diff --git a/sdk/python/requirements/py3.8-requirements.txt 
b/sdk/python/requirements/py3.8-requirements.txt index 7756acad31..362780d69e 100644 --- a/sdk/python/requirements/py3.8-requirements.txt +++ b/sdk/python/requirements/py3.8-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=sdk/python/requirements/py3.8-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata anyio==3.6.1 # via @@ -12,7 +12,7 @@ anyio==3.6.1 # watchfiles appdirs==1.4.4 # via fissix -attrs==21.4.0 +attrs==22.1.0 # via # bowler # jsonschema @@ -22,9 +22,9 @@ cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via requests -charset-normalizer==2.0.12 +charset-normalizer==2.1.0 # via requests -click==8.0.1 +click==8.1.3 # via # bowler # feast (setup.py) @@ -38,27 +38,25 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 +fastavro==1.5.4 # via # feast (setup.py) # pandavro fissix==21.11.13 # via bowler -fsspec==2022.5.0 +fsspec==2022.7.1 # via dask google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.8.0 +google-auth==2.9.1 # via google-api-core -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -73,11 +71,11 @@ idna==3.3 # via # anyio # requests -importlib-resources==5.8.0 +importlib-resources==5.9.0 # via jsonschema jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.6.0 +jsonschema==4.9.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -87,11 +85,11 @@ mmh3==3.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -mypy==0.961 +mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.21.6 +numpy==1.23.1 # via # feast (setup.py) # pandas @@ -107,9 +105,11 @@ pandavro==1.5.2 # via feast (setup.py) partd==1.2.0 # via dask +pkgutil-resolve-name==1.3.10 + # via jsonschema proto-plus==1.20.6 # via feast (setup.py) 
-protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # google-api-core @@ -117,7 +117,7 @@ protobuf==3.19.4 # grpcio-reflection # proto-plus # tensorflow-metadata -pyarrow==6.0.1 +pyarrow==8.0.0 # via feast (setup.py) pyasn1==0.4.8 # via @@ -146,9 +146,9 @@ pyyaml==6.0 # dask # feast (setup.py) # uvicorn -requests==2.28.0 +requests==2.28.1 # via google-api-core -rsa==4.8 +rsa==4.9 # via google-auth six==1.16.0 # via @@ -158,7 +158,7 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -174,7 +174,7 @@ toml==0.10.2 # via feast (setup.py) tomli==2.0.1 # via mypy -toolz==0.11.2 +toolz==0.12.0 # via # dask # partd @@ -182,23 +182,23 @@ tqdm==4.64.0 # via feast (setup.py) typeguard==2.13.3 # via feast (setup.py) -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # mypy # pydantic # sqlalchemy2-stubs # starlette -urllib3==1.26.9 +urllib3==1.26.11 # via requests -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn volatile==2.1.0 # via bowler -watchfiles==0.15.0 +watchfiles==0.16.1 # via uvicorn websockets==10.3 # via uvicorn -zipp==3.8.0 +zipp==3.8.1 # via importlib-resources diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index f9f65633f0..5d118a3ae2 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --extra=ci --output-file=sdk/python/requirements/py3.9-ci-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata adal==1.2.7 # via @@ -33,9 +33,7 @@ anyio==3.6.1 # starlette # watchfiles appdirs==1.4.4 - # via - # black - # fissix + # via fissix appnope==0.1.3 # via ipython asn1crypto==1.5.1 @@ -50,16 +48,15 @@ async-timeout==4.0.2 # via # aiohttp # redis -attrs==21.4.0 +attrs==22.1.0 # via # 
aiohttp - # black # bowler # jsonschema # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.24.1 +azure-core==1.24.2 # via # adlfs # azure-identity @@ -69,13 +66,13 @@ azure-datalake-store==0.0.52 # via adlfs azure-identity==1.10.0 # via adlfs -azure-storage-blob==12.12.0 +azure-storage-blob==12.13.0 # via adlfs babel==2.10.3 # via sphinx backcall==0.2.0 # via ipython -black==19.10b0 +black==22.6.0 # via feast (setup.py) boto3==1.20.23 # via @@ -90,10 +87,12 @@ botocore==1.23.24 bowler==0.9.0 # via feast (setup.py) build==0.8.0 - # via feast (setup.py) + # via + # feast (setup.py) + # pip-tools cachecontrol==0.12.11 # via firebase-admin -cachetools==4.2.4 +cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via @@ -101,7 +100,7 @@ certifi==2022.6.15 # msrest # requests # snowflake-connector-python -cffi==1.15.0 +cffi==1.15.1 # via # azure-datalake-store # cryptography @@ -113,7 +112,7 @@ charset-normalizer==2.0.12 # aiohttp # requests # snowflake-connector-python -click==8.0.1 +click==8.1.3 # via # black # bowler @@ -128,7 +127,7 @@ colorama==0.4.5 # via # feast (setup.py) # great-expectations -coverage[toml]==6.4.1 +coverage[toml]==6.4.2 # via pytest-cov cryptography==35.0.0 # via @@ -145,6 +144,8 @@ dask==2022.1.1 # via feast (setup.py) dataclasses==0.6 # via great-expectations +db-dtypes==1.0.2 + # via google-cloud-bigquery decorator==5.1.1 # via # gcsfs @@ -155,7 +156,7 @@ deprecation==2.1.0 # via testcontainers dill==0.3.5.1 # via feast (setup.py) -distlib==0.3.4 +distlib==0.3.5 # via virtualenv docker==5.0.3 # via @@ -169,23 +170,23 @@ entrypoints==0.4 # via altair execnet==1.9.0 # via pytest-xdist -executing==0.8.3 +executing==0.9.1 # via stack-data -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 +fastavro==1.5.4 # via # feast (setup.py) # pandavro -fastjsonschema==2.15.3 +fastjsonschema==2.16.1 # via nbformat filelock==3.7.1 # via virtualenv -firebase-admin==4.5.2 +firebase-admin==5.2.0 # via feast (setup.py) 
fissix==21.11.13 # via bowler -flake8==4.0.1 +flake8==5.0.2 # via feast (setup.py) frozenlist==1.3.0 # via @@ -199,7 +200,7 @@ fsspec==2022.1.0 # s3fs gcsfs==2022.1.0 # via feast (setup.py) -google-api-core[grpc]==1.31.6 +google-api-core[grpc]==2.8.2 # via # feast (setup.py) # firebase-admin @@ -209,9 +210,10 @@ google-api-core[grpc]==1.31.6 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.51.0 + # google-cloud-storage +google-api-python-client==2.55.0 # via firebase-admin -google-auth==1.35.0 +google-auth==2.9.1 # via # gcsfs # google-api-core @@ -224,54 +226,57 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.2 # via gcsfs -google-cloud-bigquery==2.34.4 - # via feast (setup.py) -google-cloud-bigquery-storage==2.13.2 +google-cloud-bigquery[pandas]==3.3.0 # via feast (setup.py) -google-cloud-core==1.7.2 +google-cloud-bigquery-storage==2.14.1 # via # feast (setup.py) # google-cloud-bigquery +google-cloud-core==2.3.2 + # via + # google-cloud-bigquery # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.7.1 +google-cloud-datastore==2.8.0 # via feast (setup.py) -google-cloud-firestore==2.5.3 +google-cloud-firestore==2.6.0 # via firebase-admin -google-cloud-storage==1.40.0 +google-cloud-storage==2.4.0 # via # feast (setup.py) # firebase-admin # gcsfs google-crc32c==1.3.0 # via google-resumable-media -google-resumable-media==1.3.3 +google-resumable-media==2.3.3 # via # google-cloud-bigquery # google-cloud-storage -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core + # grpcio-status # tensorflow-metadata great-expectations==0.14.13 # via feast (setup.py) -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) # google-api-core # google-cloud-bigquery # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools grpcio-reflection==1.47.0 # via feast 
(setup.py) -grpcio-testing==1.44.0 +grpcio-status==1.47.0 + # via google-api-core +grpcio-testing==1.47.0 # via feast (setup.py) -grpcio-tools==1.44.0 +grpcio-tools==1.47.0 # via feast (setup.py) h11==0.13.0 # via uvicorn @@ -285,7 +290,7 @@ httplib2==0.20.4 # google-auth-httplib2 httptools==0.4.0 # via uvicorn -identify==2.5.1 +identify==2.5.2 # via pre-commit idna==3.3 # via @@ -293,9 +298,9 @@ idna==3.3 # requests # snowflake-connector-python # yarl -imagesize==1.3.0 +imagesize==1.4.1 # via sphinx -importlib-metadata==4.11.4 +importlib-metadata==4.12.0 # via great-expectations iniconfig==1.1.1 # via pytest @@ -322,13 +327,13 @@ jsonpatch==1.32 # via great-expectations jsonpointer==2.3 # via jsonpatch -jsonschema==4.6.0 +jsonschema==4.9.0 # via # altair # feast (setup.py) # great-expectations # nbformat -jupyter-core==4.10.0 +jupyter-core==4.11.1 # via nbformat locket==1.0.0 # via partd @@ -336,13 +341,14 @@ markupsafe==2.1.1 # via # jinja2 # moto + # werkzeug matplotlib-inline==0.1.3 # via ipython -mccabe==0.6.1 +mccabe==0.7.0 # via flake8 minio==7.1.0 # via feast (setup.py) -mistune==2.0.2 +mistune==2.0.4 # via great-expectations mmh3==3.0.0 # via feast (setup.py) @@ -350,7 +356,7 @@ mock==2.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -moto==3.1.14 +moto==3.1.16 # via feast (setup.py) msal==1.18.0 # via @@ -370,23 +376,26 @@ multidict==6.0.2 # via # aiohttp # yarl -mypy==0.961 +mypy==0.971 # via # feast (setup.py) # sqlalchemy mypy-extensions==0.4.3 - # via mypy + # via + # black + # mypy mypy-protobuf==3.1 # via feast (setup.py) mysqlclient==2.1.1 # via feast (setup.py) nbformat==5.4.0 # via great-expectations -nodeenv==1.6.0 +nodeenv==1.7.0 # via pre-commit -numpy==1.21.6 +numpy==1.23.1 # via # altair + # db-dtypes # feast (setup.py) # great-expectations # pandas @@ -401,8 +410,8 @@ packaging==21.3 # via # build # dask + # db-dtypes # deprecation - # google-api-core # google-cloud-bigquery # great-expectations # pytest @@ -411,7 +420,9 @@ 
packaging==21.3 pandas==1.4.3 # via # altair + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # great-expectations # pandavro # snowflake-connector-python @@ -425,27 +436,27 @@ pathspec==0.9.0 # via black pbr==5.9.0 # via mock -pep517==0.12.0 - # via - # build - # pip-tools +pep517==0.13.0 + # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.6.2 +pip-tools==6.8.0 # via feast (setup.py) platformdirs==2.5.2 - # via virtualenv + # via + # black + # virtualenv pluggy==1.0.0 # via pytest ply==3.11 # via thriftpy2 -portalocker==2.4.0 +portalocker==2.5.1 # via msal-extensions -pre-commit==2.19.0 +pre-commit==2.20.0 # via feast (setup.py) -prompt-toolkit==3.0.29 +prompt-toolkit==3.0.30 # via ipython proto-plus==1.20.6 # via @@ -454,7 +465,7 @@ proto-plus==1.20.6 # google-cloud-bigquery-storage # google-cloud-datastore # google-cloud-firestore -protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # google-api-core @@ -464,6 +475,7 @@ protobuf==3.19.4 # google-cloud-firestore # googleapis-common-protos # grpcio-reflection + # grpcio-status # grpcio-testing # grpcio-tools # mypy-protobuf @@ -487,7 +499,9 @@ py4j==0.10.9.5 # via pyspark pyarrow==6.0.1 # via + # db-dtypes # feast (setup.py) + # google-cloud-bigquery # snowflake-connector-python pyasn1==0.4.8 # via @@ -497,7 +511,7 @@ pyasn1-modules==0.2.8 # via google-auth pybindgen==0.22.1 # via feast (setup.py) -pycodestyle==2.8.0 +pycodestyle==2.9.0 # via flake8 pycparser==2.21 # via cffi @@ -507,7 +521,7 @@ pydantic==1.9.1 # via # fastapi # feast (setup.py) -pyflakes==2.4.0 +pyflakes==2.5.0 # via flake8 pygments==2.12.0 # via @@ -570,7 +584,6 @@ python-dotenv==0.20.0 pytz==2022.1 # via # babel - # google-api-core # great-expectations # moto # pandas @@ -586,9 +599,7 @@ pyyaml==6.0 # uvicorn redis==4.2.2 # via feast (setup.py) -regex==2022.6.2 - # via black -requests==2.28.0 +requests==2.28.1 # via # adal # adlfs @@ -615,7 +626,7 @@ requests-oauthlib==1.3.1 # msrest 
responses==0.21.0 # via moto -rsa==4.8 +rsa==4.9 # via google-auth ruamel-yaml==0.17.17 # via great-expectations @@ -625,24 +636,20 @@ s3fs==2022.1.0 # via feast (setup.py) s3transfer==0.5.2 # via boto3 -scipy==1.8.1 +scipy==1.9.0 # via great-expectations six==1.16.0 # via # azure-core # azure-identity - # google-api-core # google-auth # google-auth-httplib2 - # google-cloud-core - # google-resumable-media # grpcio # happybase # mock # msrestazure # pandavro # python-dateutil - # virtualenv sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 @@ -667,7 +674,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -689,17 +696,17 @@ thriftpy2==0.4.14 # via happybase toml==0.10.2 # via - # black # feast (setup.py) # pre-commit tomli==2.0.1 # via + # black # build # coverage # mypy # pep517 # pytest -toolz==0.11.2 +toolz==0.12.0 # via # altair # dask @@ -714,36 +721,35 @@ traitlets==5.3.0 # jupyter-core # matplotlib-inline # nbformat -trino==0.313.0 +trino==0.315.0 # via feast (setup.py) -typed-ast==1.5.4 - # via black typeguard==2.13.3 # via feast (setup.py) types-protobuf==3.19.22 # via # feast (setup.py) # mypy-protobuf -types-python-dateutil==2.8.18 +types-python-dateutil==2.8.19 # via feast (setup.py) -types-pytz==2022.1.0 +types-pytz==2022.1.2 # via feast (setup.py) -types-pyyaml==6.0.8 +types-pyyaml==6.0.11 # via feast (setup.py) -types-redis==4.3.2 +types-redis==4.3.13 # via feast (setup.py) -types-requests==2.27.31 +types-requests==2.28.6 # via feast (setup.py) -types-setuptools==57.4.17 +types-setuptools==63.2.2 # via feast (setup.py) -types-tabulate==0.8.10 +types-tabulate==0.8.11 # via feast (setup.py) -types-urllib3==1.26.15 +types-urllib3==1.26.20 # via types-requests -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # aioitertools # azure-core + # black # great-expectations # mypy # 
pydantic @@ -755,7 +761,7 @@ tzlocal==4.2 # via great-expectations uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.9 +urllib3==1.26.11 # via # botocore # feast (setup.py) @@ -763,15 +769,15 @@ urllib3==1.26.9 # minio # requests # responses -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.14.1 +virtualenv==20.16.2 # via pre-commit volatile==2.1.0 # via bowler -watchfiles==0.15.0 +watchfiles==0.16.1 # via uvicorn wcwidth==0.2.5 # via prompt-toolkit @@ -779,7 +785,7 @@ websocket-client==1.3.3 # via docker websockets==10.3 # via uvicorn -werkzeug==2.1.2 +werkzeug==2.2.1 # via moto wheel==0.37.1 # via pip-tools @@ -790,9 +796,9 @@ wrapt==1.14.1 # testcontainers xmltodict==0.13.0 # via moto -yarl==1.7.2 +yarl==1.8.0 # via aiohttp -zipp==3.8.0 +zipp==3.8.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/sdk/python/requirements/py3.9-requirements.txt b/sdk/python/requirements/py3.9-requirements.txt index f5c15dad5d..1ef60c531a 100644 --- a/sdk/python/requirements/py3.9-requirements.txt +++ b/sdk/python/requirements/py3.9-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=sdk/python/requirements/py3.9-requirements.txt # -absl-py==1.1.0 +absl-py==1.2.0 # via tensorflow-metadata anyio==3.6.1 # via @@ -12,7 +12,7 @@ anyio==3.6.1 # watchfiles appdirs==1.4.4 # via fissix -attrs==21.4.0 +attrs==22.1.0 # via # bowler # jsonschema @@ -22,9 +22,9 @@ cachetools==5.2.0 # via google-auth certifi==2022.6.15 # via requests -charset-normalizer==2.0.12 +charset-normalizer==2.1.0 # via requests -click==8.0.1 +click==8.1.3 # via # bowler # feast (setup.py) @@ -38,27 +38,25 @@ dask==2022.1.1 # via feast (setup.py) dill==0.3.5.1 # via feast (setup.py) -fastapi==0.78.0 +fastapi==0.79.0 # via feast (setup.py) -fastavro==1.5.1 +fastavro==1.5.4 # via # feast (setup.py) # pandavro fissix==21.11.13 # via bowler -fsspec==2022.5.0 
+fsspec==2022.7.1 # via dask google-api-core==2.8.2 # via feast (setup.py) -google-auth==2.8.0 +google-auth==2.9.1 # via google-api-core -googleapis-common-protos==1.56.3 +googleapis-common-protos==1.56.4 # via # feast (setup.py) # google-api-core # tensorflow-metadata -greenlet==1.1.2 - # via sqlalchemy grpcio==1.47.0 # via # feast (setup.py) @@ -75,7 +73,7 @@ idna==3.3 # requests jinja2==3.1.2 # via feast (setup.py) -jsonschema==4.6.0 +jsonschema==4.9.0 # via feast (setup.py) locket==1.0.0 # via partd @@ -85,11 +83,11 @@ mmh3==3.0.0 # via feast (setup.py) moreorless==0.4.0 # via bowler -mypy==0.961 +mypy==0.971 # via sqlalchemy mypy-extensions==0.4.3 # via mypy -numpy==1.21.6 +numpy==1.23.1 # via # feast (setup.py) # pandas @@ -107,7 +105,7 @@ partd==1.2.0 # via dask proto-plus==1.20.6 # via feast (setup.py) -protobuf==3.19.4 +protobuf==3.20.1 # via # feast (setup.py) # google-api-core @@ -115,7 +113,7 @@ protobuf==3.19.4 # grpcio-reflection # proto-plus # tensorflow-metadata -pyarrow==6.0.1 +pyarrow==8.0.0 # via feast (setup.py) pyasn1==0.4.8 # via @@ -144,9 +142,9 @@ pyyaml==6.0 # dask # feast (setup.py) # uvicorn -requests==2.28.0 +requests==2.28.1 # via google-api-core -rsa==4.8 +rsa==4.9 # via google-auth six==1.16.0 # via @@ -156,7 +154,7 @@ six==1.16.0 # python-dateutil sniffio==1.2.0 # via anyio -sqlalchemy[mypy]==1.4.38 +sqlalchemy[mypy]==1.4.39 # via feast (setup.py) sqlalchemy2-stubs==0.0.2a24 # via sqlalchemy @@ -172,7 +170,7 @@ toml==0.10.2 # via feast (setup.py) tomli==2.0.1 # via mypy -toolz==0.11.2 +toolz==0.12.0 # via # dask # partd @@ -180,21 +178,21 @@ tqdm==4.64.0 # via feast (setup.py) typeguard==2.13.3 # via feast (setup.py) -typing-extensions==4.2.0 +typing-extensions==4.3.0 # via # mypy # pydantic # sqlalchemy2-stubs # starlette -urllib3==1.26.9 +urllib3==1.26.11 # via requests -uvicorn[standard]==0.18.1 +uvicorn[standard]==0.18.2 # via feast (setup.py) uvloop==0.16.0 # via uvicorn volatile==2.1.0 # via bowler -watchfiles==0.15.0 
+watchfiles==0.16.1 # via uvicorn websockets==10.3 # via uvicorn diff --git a/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py b/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py index 6e22c93e5f..03070887c4 100644 --- a/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py +++ b/sdk/python/tests/benchmarks/test_benchmark_universal_online_retrieval.py @@ -60,5 +60,7 @@ def test_online_retrieval(environment, universal_data_sources, benchmark): unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") benchmark( - fs.get_online_features, features=feature_refs, entity_rows=entity_rows, + fs.get_online_features, + features=feature_refs, + entity_rows=entity_rows, ) diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index bf69a85fa3..b4bcccd9c6 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -13,8 +13,7 @@ # limitations under the License. import logging import multiprocessing -import socket -from contextlib import closing +import os from datetime import datetime, timedelta from multiprocessing import Process from sys import platform @@ -24,13 +23,15 @@ import pytest from _pytest.nodes import Item -from feast import FeatureStore -from feast.wait import wait_retry_backoff -from tests.data.data_creator import create_dataset -from tests.integration.feature_repos.integration_test_repo_config import ( +os.environ["FEAST_USAGE"] = "False" +os.environ["IS_TEST"] = "True" +from feast import FeatureStore # noqa: E402 +from feast.wait import wait_retry_backoff # noqa: E402 +from tests.data.data_creator import create_basic_driver_dataset # noqa: E402 +from tests.integration.feature_repos.integration_test_repo_config import ( # noqa: E402 IntegrationTestRepoConfig, ) -from tests.integration.feature_repos.repo_configuration import ( +from tests.integration.feature_repos.repo_configuration import ( # noqa: E402 AVAILABLE_OFFLINE_STORES, AVAILABLE_ONLINE_STORES, 
OFFLINE_STORE_TO_PROVIDER_CONFIG, @@ -39,12 +40,29 @@ construct_test_environment, construct_universal_test_data, ) -from tests.integration.feature_repos.universal.data_sources.file import ( +from tests.integration.feature_repos.universal.data_sources.file import ( # noqa: E402 FileDataSourceCreator, ) +from tests.utils.http_server import check_port_open, free_port # noqa: E402 logger = logging.getLogger(__name__) +level = logging.INFO +logging.basicConfig( + format="%(asctime)s %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + level=level, +) +# Override the logging level for already created loggers (due to loggers being created at the import time) +# Note, that format & datefmt does not need to be set, because by default child loggers don't override them + +# Also note, that mypy complains that logging.root doesn't have "manager" because of the way it's written. +# So we have to put a type ignore hint for mypy. +for logger_name in logging.root.manager.loggerDict: # type: ignore + if "feast" in logger_name: + logger = logging.getLogger(logger_name) + logger.setLevel(level) + def pytest_configure(config): if platform in ["darwin", "windows"]: @@ -76,7 +94,10 @@ def pytest_addoption(parser): help="Run tests with external dependencies", ) parser.addoption( - "--benchmark", action="store_true", default=False, help="Run benchmark tests", + "--benchmark", + action="store_true", + default=False, + help="Run benchmark tests", ) parser.addoption( "--goserver", @@ -110,7 +131,10 @@ def pytest_collection_modifyitems(config, items: List[Item]): items.append(t) goserver_tests = [t for t in items if "goserver" in t.keywords] - if should_run_goserver: + if not should_run_goserver: + for t in goserver_tests: + items.remove(t) + else: items.clear() for t in goserver_tests: items.append(t) @@ -161,7 +185,7 @@ def start_test_local_server(repo_path: str, port: int): fs.serve("localhost", port, no_access_log=True) -@pytest.fixture(scope="session") 
+@pytest.fixture def environment(request, worker_id): e = construct_test_environment( request.param, worker_id=worker_id, fixture_request=request @@ -246,7 +270,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): ) if "goserver" in markers: - extra_dimensions.append({"go_feature_retrieval": True}) + extra_dimensions.append({"go_feature_serving": True}) configs = [] if offline_stores: @@ -261,7 +285,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): **dim, } # temporary Go works only with redis - if config.get("go_feature_retrieval") and ( + if config.get("go_feature_serving") and ( not isinstance(online_store, dict) or online_store["type"] != "redis" ): @@ -293,7 +317,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): ) -@pytest.fixture(scope="session") +@pytest.fixture def feature_server_endpoint(environment): if ( not environment.python_feature_server @@ -302,7 +326,7 @@ def feature_server_endpoint(environment): yield environment.feature_store.get_feature_server_endpoint() return - port = _free_port() + port = free_port() proc = Process( target=start_test_local_server, @@ -315,7 +339,8 @@ def feature_server_endpoint(environment): proc.start() # Wait for server to start wait_retry_backoff( - lambda: (None, _check_port_open("localhost", port)), timeout_secs=10, + lambda: (None, check_port_open("localhost", port)), + timeout_secs=10, ) yield f"http://localhost:{port}" @@ -327,33 +352,24 @@ def feature_server_endpoint(environment): wait_retry_backoff( lambda: ( None, - not _check_port_open("localhost", environment.get_local_server_port()), + not check_port_open("localhost", environment.get_local_server_port()), ), timeout_secs=30, ) -def _check_port_open(host, port) -> bool: - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - return sock.connect_ex((host, port)) == 0 - - -def _free_port(): - sock = socket.socket() - sock.bind(("", 0)) - return sock.getsockname()[1] - - -@pytest.fixture(scope="session") +@pytest.fixture 
def universal_data_sources(environment) -> TestData: return construct_universal_test_data(environment) -@pytest.fixture(scope="session") +@pytest.fixture def e2e_data_sources(environment: Environment): - df = create_dataset() + df = create_basic_driver_dataset() data_source = environment.data_source_creator.create_data_source( - df, environment.feature_store.project, field_mapping={"ts_1": "ts"}, + df, + environment.feature_store.project, + field_mapping={"ts_1": "ts"}, ) return df, data_source diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py index 186c39b9ef..2155468445 100644 --- a/sdk/python/tests/data/data_creator.py +++ b/sdk/python/tests/data/data_creator.py @@ -7,7 +7,7 @@ from feast.types import FeastType, Float32, Int32, Int64, String -def create_dataset( +def create_basic_driver_dataset( entity_type: FeastType = Int32, feature_dtype: str = None, feature_is_list: bool = False, diff --git a/sdk/python/tests/doctest/test_all.py b/sdk/python/tests/doctest/test_all.py index 31f181ad53..0412e34c36 100644 --- a/sdk/python/tests/doctest/test_all.py +++ b/sdk/python/tests/doctest/test_all.py @@ -17,7 +17,10 @@ def setup_feature_store(): init_repo("feature_repo", "local") fs = FeatureStore(repo_path="feature_repo") - driver = Entity(name="driver_id", description="driver id",) + driver = Entity( + name="driver_id", + description="driver id", + ) driver_hourly_stats = FileSource( path="feature_repo/data/driver_stats.parquet", timestamp_field="event_timestamp", @@ -88,7 +91,8 @@ def test_docstrings(): setup_function() test_suite = doctest.DocTestSuite( - temp_module, optionflags=doctest.ELLIPSIS, + temp_module, + optionflags=doctest.ELLIPSIS, ) if test_suite.countTestCases() > 0: result = unittest.TextTestRunner(sys.stdout).run(test_suite) diff --git a/sdk/python/tests/example_repos/empty_feature_repo.py b/sdk/python/tests/example_repos/empty_feature_repo.py new file mode 100644 index 0000000000..8353c2a7fd --- /dev/null +++ 
b/sdk/python/tests/example_repos/empty_feature_repo.py @@ -0,0 +1,3 @@ +# This example feature repo is deliberately left empty. It should be used for tests that do not need +# any feature views or other objects (for example, a test that checks that a feature service can be +# applied and retrieved correctly). diff --git a/sdk/python/tests/example_repos/example_feature_repo_1.py b/sdk/python/tests/example_repos/example_feature_repo_1.py index 8d6d96d9ef..200065f0b1 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_1.py +++ b/sdk/python/tests/example_repos/example_feature_repo_1.py @@ -1,38 +1,32 @@ from datetime import timedelta -from feast import BigQuerySource, Entity, FeatureService, FeatureView, Field, PushSource +from feast import Entity, FeatureService, FeatureView, Field, FileSource, PushSource from feast.types import Float32, Int64, String -driver_locations_source = BigQuerySource( - table="feast-oss.public.drivers", - timestamp_field="event_timestamp", - created_timestamp_column="created_timestamp", -) +# Note that file source paths are not validated, so there doesn't actually need to be any data +# at the paths for these file sources. Since these paths are effectively fake, this example +# feature repo should not be used for historical retrieval. 
-driver_locations_source_query = BigQuerySource( - query="SELECT * from feast-oss.public.drivers", +driver_locations_source = FileSource( + path="data/driver_locations.parquet", timestamp_field="event_timestamp", created_timestamp_column="created_timestamp", ) -driver_locations_source_query_2 = BigQuerySource( - query="SELECT lat * 2 FROM feast-oss.public.drivers", - timestamp_field="event_timestamp", - created_timestamp_column="created_timestamp", -) - -customer_profile_source = BigQuerySource( +customer_profile_source = FileSource( name="customer_profile_source", - table="feast-oss.public.customers", + path="data/customer_profiles.parquet", timestamp_field="event_timestamp", ) -customer_driver_combined_source = BigQuerySource( - table="feast-oss.public.customer_driver", timestamp_field="event_timestamp", +customer_driver_combined_source = FileSource( + path="data/customer_driver_combined.parquet", + timestamp_field="event_timestamp", ) driver_locations_push_source = PushSource( - name="driver_locations_push", batch_source=driver_locations_source, + name="driver_locations_push", + batch_source=driver_locations_source, ) driver = Entity( diff --git a/sdk/python/tests/example_repos/example_feature_repo_2.py b/sdk/python/tests/example_repos/example_feature_repo_2.py index 073c48c1c1..21476e3779 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_2.py +++ b/sdk/python/tests/example_repos/example_feature_repo_2.py @@ -9,7 +9,10 @@ created_timestamp_column="created", ) -driver = Entity(name="driver_id", description="driver id",) +driver = Entity( + name="driver_id", + description="driver id", +) driver_hourly_stats_view = FeatureView( diff --git a/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py b/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py index a65c031cea..68681794f9 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py +++ 
b/sdk/python/tests/example_repos/example_feature_repo_version_0_19.py @@ -57,7 +57,8 @@ request_source = RequestDataSource( - name="conv_rate_input", schema={"val_to_add": ValueType.INT64}, + name="conv_rate_input", + schema={"val_to_add": ValueType.INT64}, ) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_driver_stats_feature_view.py b/sdk/python/tests/example_repos/example_feature_repo_with_driver_stats_feature_view.py new file mode 100644 index 0000000000..b6525abbfc --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_driver_stats_feature_view.py @@ -0,0 +1,30 @@ +from datetime import timedelta + +from feast import Entity, FeatureView, Field, FileSource +from feast.types import Float32, Int32, Int64 + +driver_hourly_stats = FileSource( + path="data/driver_stats.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +driver = Entity( + name="driver_id", + description="driver id", +) + +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + Field(name="driver_id", dtype=Int32), + ], + online=True, + source=driver_hourly_stats, + tags={}, +) diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_duplicated_featureview_names.py b/sdk/python/tests/example_repos/example_feature_repo_with_duplicated_featureview_names.py index 4b079999ed..77b435ecc9 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_with_duplicated_featureview_names.py +++ b/sdk/python/tests/example_repos/example_feature_repo_with_duplicated_featureview_names.py @@ -6,7 +6,11 @@ path="driver_stats.parquet", # this parquet is not real and will not be read ) -driver = Entity(name="driver_id", description="driver id", join_keys=["driver"],) +driver = Entity( + name="driver_id", + 
description="driver id", + join_keys=["driver"], +) driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", # Intentionally use the same FeatureView name diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_entity_join_key.py b/sdk/python/tests/example_repos/example_feature_repo_with_entity_join_key.py index 0663150531..c30b933eaf 100644 --- a/sdk/python/tests/example_repos/example_feature_repo_with_entity_join_key.py +++ b/sdk/python/tests/example_repos/example_feature_repo_with_entity_join_key.py @@ -11,7 +11,11 @@ # The join key here is deliberately different from the parquet file to test the failure path. -driver = Entity(name="driver_id", description="driver id", join_keys=["driver"],) +driver = Entity( + name="driver_id", + description="driver id", + join_keys=["driver"], +) driver_hourly_stats_view = FeatureView( diff --git a/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py new file mode 100644 index 0000000000..372bd9afb7 --- /dev/null +++ b/sdk/python/tests/example_repos/example_feature_repo_with_feature_service.py @@ -0,0 +1,36 @@ +from datetime import timedelta + +from feast import Entity, FeatureService, FeatureView, Field, FileSource +from feast.types import Float32, Int64, String + +driver_locations_source = FileSource( + path="data/driver_locations.parquet", + timestamp_field="event_timestamp", + created_timestamp_column="created_timestamp", +) + +driver = Entity( + name="driver", # The name is derived from this argument, not object name. 
+ join_keys=["driver_id"], + description="driver id", +) + +driver_locations = FeatureView( + name="driver_locations", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="lat", dtype=Float32), + Field(name="lon", dtype=String), + Field(name="driver_id", dtype=Int64), + ], + online=True, + batch_source=driver_locations_source, + tags={}, +) + +all_drivers_feature_service = FeatureService( + name="driver_locations_service", + features=[driver_locations], + tags={"release": "production"}, +) diff --git a/sdk/python/tests/example_repos/on_demand_feature_view_repo.py b/sdk/python/tests/example_repos/on_demand_feature_view_repo.py index ac572d5747..5df0ee1c6f 100644 --- a/sdk/python/tests/example_repos/on_demand_feature_view_repo.py +++ b/sdk/python/tests/example_repos/on_demand_feature_view_repo.py @@ -15,7 +15,10 @@ owner="test2@gmail.com", ) -driver = Entity(name="driver_id", description="driver id",) +driver = Entity( + name="driver_id", + description="driver id", +) driver_daily_features_view = FeatureView( name="driver_daily_features", diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index bd6f9811e8..7866465b91 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -31,7 +31,10 @@ def update_infra( pass def teardown_infra( - self, project: str, tables: Sequence[FeatureView], entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): pass diff --git a/sdk/python/tests/integration/e2e/test_go_feature_server.py b/sdk/python/tests/integration/e2e/test_go_feature_server.py index 4fd003c194..0f972e45df 100644 --- a/sdk/python/tests/integration/e2e/test_go_feature_server.py +++ b/sdk/python/tests/integration/e2e/test_go_feature_server.py @@ -1,7 +1,5 @@ -import socket import threading import time -from contextlib import closing from datetime import datetime from typing import List @@ -11,10 +9,10 @@ import pytz import requests 
-from feast import FeatureService, FeatureView, ValueType from feast.embedded_go.online_features_service import EmbeddedOnlineFeatureServer from feast.feast_object import FeastObject from feast.feature_logging import LoggingConfig +from feast.feature_service import FeatureService from feast.infra.feature_servers.base_config import FeatureLoggingConfig from feast.protos.feast.serving.ServingService_pb2 import ( FieldStatus, @@ -24,6 +22,7 @@ from feast.protos.feast.serving.ServingService_pb2_grpc import ServingServiceStub from feast.protos.feast.types.Value_pb2 import RepeatedValue from feast.type_map import python_values_to_proto_values +from feast.value_type import ValueType from feast.wait import wait_retry_backoff from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, @@ -33,92 +32,8 @@ driver, location, ) - - -@pytest.fixture(scope="session") -def initialized_registry(environment, universal_data_sources): - fs = environment.feature_store - - _, _, data_sources = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - - feature_service = FeatureService( - name="driver_features", - features=[feature_views.driver], - logging_config=LoggingConfig( - destination=environment.data_source_creator.create_logged_features_destination(), - sample_rate=1.0, - ), - ) - feast_objects: List[FeastObject] = [feature_service] - feast_objects.extend(feature_views.values()) - feast_objects.extend([driver(), customer(), location()]) - - fs.apply(feast_objects) - fs.materialize(environment.start_date, environment.end_date) - - -def server_port(environment, server_type: str): - if not environment.test_repo_config.go_feature_retrieval: - pytest.skip("Only for Go path") - - fs = environment.feature_store - - embedded = EmbeddedOnlineFeatureServer( - repo_path=str(fs.repo_path.absolute()), repo_config=fs.config, feature_store=fs, - ) - port = free_port() - if server_type == "grpc": - target = 
embedded.start_grpc_server - elif server_type == "http": - target = embedded.start_http_server - else: - raise ValueError("Server Type must be either 'http' or 'grpc'") - - t = threading.Thread( - target=target, - args=("127.0.0.1", port), - kwargs=dict( - enable_logging=True, - logging_options=FeatureLoggingConfig( - enabled=True, - queue_capacity=100, - write_to_disk_interval_secs=1, - flush_interval_secs=1, - emit_timeout_micro_secs=10000, - ), - ), - ) - t.start() - - wait_retry_backoff( - lambda: (None, check_port_open("127.0.0.1", port)), timeout_secs=15 - ) - - yield port - if server_type == "grpc": - embedded.stop_grpc_server() - else: - embedded.stop_http_server() - - # wait for graceful stop - time.sleep(5) - - -@pytest.fixture -def grpc_server_port(environment, initialized_registry): - yield from server_port(environment, "grpc") - - -@pytest.fixture -def http_server_port(environment, initialized_registry): - yield from server_port(environment, "http") - - -@pytest.fixture -def grpc_client(grpc_server_port): - ch = grpc.insecure_channel(f"localhost:{grpc_server_port}") - yield ServingServiceStub(ch) +from tests.utils.http_server import check_port_open, free_port +from tests.utils.test_log_creator import generate_expected_logs, get_latest_rows @pytest.mark.integration @@ -252,43 +167,97 @@ def retrieve(): pd.testing.assert_frame_equal(expected_logs, persisted_logs, check_dtype=False) -def free_port(): - sock = socket.socket() - sock.bind(("", 0)) - return sock.getsockname()[1] +""" +Start go feature server either on http or grpc based on the repo configuration for testing. 
+""" -def check_port_open(host, port) -> bool: - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - return sock.connect_ex((host, port)) == 0 +def _server_port(environment, server_type: str): + if not environment.test_repo_config.go_feature_serving: + pytest.skip("Only for Go path") + fs = environment.feature_store + + embedded = EmbeddedOnlineFeatureServer( + repo_path=str(fs.repo_path.absolute()), + repo_config=fs.config, + feature_store=fs, + ) + port = free_port() + if server_type == "grpc": + target = embedded.start_grpc_server + elif server_type == "http": + target = embedded.start_http_server + else: + raise ValueError("Server Type must be either 'http' or 'grpc'") -def get_latest_rows(df, join_key, entity_values): - rows = df[df[join_key].isin(entity_values)] - return rows.loc[rows.groupby(join_key)["event_timestamp"].idxmax()] + t = threading.Thread( + target=target, + args=("127.0.0.1", port), + kwargs=dict( + enable_logging=True, + logging_options=FeatureLoggingConfig( + enabled=True, + queue_capacity=100, + write_to_disk_interval_secs=1, + flush_interval_secs=1, + emit_timeout_micro_secs=10000, + ), + ), + ) + t.start() + wait_retry_backoff( + lambda: (None, check_port_open("127.0.0.1", port)), timeout_secs=15 + ) -def generate_expected_logs( - df: pd.DataFrame, - feature_view: FeatureView, - features: List[str], - join_keys: List[str], - timestamp_column: str, -): - logs = pd.DataFrame() - for join_key in join_keys: - logs[join_key] = df[join_key] - - for feature in features: - col = f"{feature_view.name}__{feature}" - logs[col] = df[feature] - logs[f"{col}__timestamp"] = df[timestamp_column] - logs[f"{col}__status"] = FieldStatus.PRESENT - if feature_view.ttl: - logs[f"{col}__status"] = logs[f"{col}__status"].mask( - df[timestamp_column] - < datetime.utcnow().replace(tzinfo=pytz.UTC) - feature_view.ttl, - FieldStatus.OUTSIDE_MAX_AGE, - ) + yield port + if server_type == "grpc": + embedded.stop_grpc_server() + else: + 
embedded.stop_http_server() - return logs.sort_values(by=join_keys).reset_index(drop=True) + # wait for graceful stop + time.sleep(5) + + +# Go test fixtures + + +@pytest.fixture +def initialized_registry(environment, universal_data_sources): + fs = environment.feature_store + + _, _, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + feature_service = FeatureService( + name="driver_features", + features=[feature_views.driver], + logging_config=LoggingConfig( + destination=environment.data_source_creator.create_logged_features_destination(), + sample_rate=1.0, + ), + ) + feast_objects: List[FeastObject] = [feature_service] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer(), location()]) + + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + + +@pytest.fixture +def grpc_server_port(environment, initialized_registry): + yield from _server_port(environment, "grpc") + + +@pytest.fixture +def http_server_port(environment, initialized_registry): + yield from _server_port(environment, "http") + + +@pytest.fixture +def grpc_client(grpc_server_port): + ch = grpc.insecure_channel(f"localhost:{grpc_server_port}") + yield ServingServiceStub(ch) diff --git a/sdk/python/tests/integration/e2e/test_python_feature_server.py b/sdk/python/tests/integration/e2e/test_python_feature_server.py index ea4c35a1ca..9c61f6fa19 100644 --- a/sdk/python/tests/integration/e2e/test_python_feature_server.py +++ b/sdk/python/tests/integration/e2e/test_python_feature_server.py @@ -7,13 +7,8 @@ from feast.feast_object import FeastObject from feast.feature_server import get_app -from tests.integration.feature_repos.integration_test_repo_config import ( - IntegrationTestRepoConfig, -) from tests.integration.feature_repos.repo_configuration import ( - construct_test_environment, construct_universal_feature_views, - construct_universal_test_data, ) from 
tests.integration.feature_repos.universal.entities import ( customer, @@ -63,7 +58,9 @@ def test_get_online_features(python_fs_client): @pytest.mark.integration @pytest.mark.universal_online_stores def test_push(python_fs_client): - initial_temp = get_temperatures(python_fs_client, location_ids=[1])[0] + initial_temp = _get_temperatures_from_feature_server( + python_fs_client, location_ids=[1] + )[0] json_data = json.dumps( { "push_source_name": "location_stats_push_source", @@ -75,14 +72,19 @@ def test_push(python_fs_client): }, } ) - response = python_fs_client.post("/push", data=json_data,) + response = python_fs_client.post( + "/push", + data=json_data, + ) # Check new pushed temperature is fetched assert response.status_code == 200 - assert get_temperatures(python_fs_client, location_ids=[1]) == [initial_temp * 100] + assert _get_temperatures_from_feature_server( + python_fs_client, location_ids=[1] + ) == [initial_temp * 100] -def get_temperatures(client, location_ids: List[int]): +def _get_temperatures_from_feature_server(client, location_ids: List[int]): get_request_data = { "features": ["pushable_location_stats:temperature"], "entities": {"location_id": location_ids}, @@ -99,20 +101,14 @@ def get_temperatures(client, location_ids: List[int]): @pytest.fixture -def python_fs_client(request): - config = IntegrationTestRepoConfig() - environment = construct_test_environment(config, fixture_request=request) +def python_fs_client(environment, universal_data_sources, request): fs = environment.feature_store - try: - entities, datasets, data_sources = construct_universal_test_data(environment) - feature_views = construct_universal_feature_views(data_sources) - feast_objects: List[FeastObject] = [] - feast_objects.extend(feature_views.values()) - feast_objects.extend([driver(), customer(), location()]) - fs.apply(feast_objects) - fs.materialize(environment.start_date, environment.end_date) - client = TestClient(get_app(fs)) - yield client - finally: - fs.teardown() 
- environment.data_source_creator.teardown() + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + feast_objects: List[FeastObject] = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer(), location()]) + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + client = TestClient(get_app(fs)) + yield client diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index a42a96e594..202ae859ae 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -1,14 +1,10 @@ -import math -from datetime import datetime, timedelta -from typing import Optional +from datetime import timedelta -import pandas as pd import pytest -from pytz import utc -from feast import FeatureStore, FeatureView from tests.integration.feature_repos.universal.entities import driver from tests.integration.feature_repos.universal.feature_views import driver_feature_view +from tests.utils.e2e_test_validation import validate_offline_online_store_consistency @pytest.mark.integration @@ -30,133 +26,4 @@ def test_e2e_consistency(environment, e2e_data_sources, infer_features): # we use timestamp from generated dataframe as a split point split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1) - run_offline_online_store_consistency_test(fs, fv, split_dt) - - -def check_offline_and_online_features( - fs: FeatureStore, - fv: FeatureView, - driver_id: int, - event_timestamp: datetime, - expected_value: Optional[float], - full_feature_names: bool, - check_offline_store: bool = True, -) -> None: - # Check online store - response_dict = fs.get_online_features( - [f"{fv.name}:value"], - [{"driver_id": driver_id}], - full_feature_names=full_feature_names, - ).to_dict() - - if full_feature_names: - - if expected_value: - assert 
response_dict[f"{fv.name}__value"][0], f"Response: {response_dict}" - assert ( - abs(response_dict[f"{fv.name}__value"][0] - expected_value) < 1e-6 - ), f"Response: {response_dict}, Expected: {expected_value}" - else: - assert response_dict[f"{fv.name}__value"][0] is None - else: - if expected_value: - assert response_dict["value"][0], f"Response: {response_dict}" - assert ( - abs(response_dict["value"][0] - expected_value) < 1e-6 - ), f"Response: {response_dict}, Expected: {expected_value}" - else: - assert response_dict["value"][0] is None - - # Check offline store - if check_offline_store: - df = fs.get_historical_features( - entity_df=pd.DataFrame.from_dict( - {"driver_id": [driver_id], "event_timestamp": [event_timestamp]} - ), - features=[f"{fv.name}:value"], - full_feature_names=full_feature_names, - ).to_df() - - if full_feature_names: - if expected_value: - assert ( - abs( - df.to_dict(orient="list")[f"{fv.name}__value"][0] - - expected_value - ) - < 1e-6 - ) - else: - assert not df.to_dict(orient="list")[f"{fv.name}__value"] or math.isnan( - df.to_dict(orient="list")[f"{fv.name}__value"][0] - ) - else: - if expected_value: - assert ( - abs(df.to_dict(orient="list")["value"][0] - expected_value) < 1e-6 - ) - else: - assert not df.to_dict(orient="list")["value"] or math.isnan( - df.to_dict(orient="list")["value"][0] - ) - - -def run_offline_online_store_consistency_test( - fs: FeatureStore, fv: FeatureView, split_dt: datetime -) -> None: - now = datetime.utcnow() - - full_feature_names = True - check_offline_store: bool = True - - # Run materialize() - # use both tz-naive & tz-aware timestamps to test that they're both correctly handled - start_date = (now - timedelta(hours=5)).replace(tzinfo=utc) - end_date = split_dt - fs.materialize(feature_views=[fv.name], start_date=start_date, end_date=end_date) - - # check result of materialize() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=1, - event_timestamp=end_date, - expected_value=0.3, - 
full_feature_names=full_feature_names, - check_offline_store=check_offline_store, - ) - - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=2, - event_timestamp=end_date, - expected_value=None, - full_feature_names=full_feature_names, - check_offline_store=check_offline_store, - ) - - # check prior value for materialize_incremental() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=3, - event_timestamp=end_date, - expected_value=4, - full_feature_names=full_feature_names, - check_offline_store=check_offline_store, - ) - - # run materialize_incremental() - fs.materialize_incremental(feature_views=[fv.name], end_date=now) - - # check result of materialize_incremental() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=3, - event_timestamp=now, - expected_value=5, - full_feature_names=full_feature_names, - check_offline_store=check_offline_store, - ) + validate_offline_online_store_consistency(fs, fv, split_dt) diff --git a/sdk/python/tests/integration/e2e/test_usage_e2e.py b/sdk/python/tests/integration/e2e/test_usage_e2e.py index 53e4a32a82..5c95bd50b1 100644 --- a/sdk/python/tests/integration/e2e/test_usage_e2e.py +++ b/sdk/python/tests/integration/e2e/test_usage_e2e.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# This file tests our usage tracking system in `usage.py`. 
import os import sys import tempfile diff --git a/sdk/python/tests/integration/e2e/test_validation.py b/sdk/python/tests/integration/e2e/test_validation.py index 0d65640dca..7062948f53 100644 --- a/sdk/python/tests/integration/e2e/test_validation.py +++ b/sdk/python/tests/integration/e2e/test_validation.py @@ -26,8 +26,8 @@ driver, location, ) -from tests.utils.cli_utils import CliRunner -from tests.utils.logged_features import prepare_logs +from tests.utils.cli_repo_creator import CliRunner +from tests.utils.test_log_creator import prepare_logs _features = [ "customer_profile:current_balance", @@ -39,72 +39,6 @@ ] -@ge_profiler -def configurable_profiler(dataset: PandasDataset) -> ExpectationSuite: - from great_expectations.profile.user_configurable_profiler import ( - UserConfigurableProfiler, - ) - - return UserConfigurableProfiler( - profile_dataset=dataset, - ignored_columns=["event_timestamp"], - excluded_expectations=[ - "expect_table_columns_to_match_ordered_list", - "expect_table_row_count_to_be_between", - ], - value_set_threshold="few", - ).build_suite() - - -@ge_profiler(with_feature_metadata=True) -def profiler_with_feature_metadata(dataset: PandasDataset) -> ExpectationSuite: - from great_expectations.profile.user_configurable_profiler import ( - UserConfigurableProfiler, - ) - - # always present - dataset.expect_column_values_to_be_in_set( - "global_stats__avg_ride_length__status", {FieldStatus.PRESENT} - ) - - # present at least in 70% of rows - dataset.expect_column_values_to_be_in_set( - "customer_profile__current_balance__status", {FieldStatus.PRESENT}, mostly=0.7 - ) - - return UserConfigurableProfiler( - profile_dataset=dataset, - ignored_columns=["event_timestamp"] - + [ - c - for c in dataset.columns - if c.endswith("__timestamp") or c.endswith("__status") - ], - excluded_expectations=[ - "expect_table_columns_to_match_ordered_list", - "expect_table_row_count_to_be_between", - ], - value_set_threshold="few", - ).build_suite() - - 
-@ge_profiler -def profiler_with_unrealistic_expectations(dataset: PandasDataset) -> ExpectationSuite: - # need to create dataframe with corrupted data first - df = pd.DataFrame() - df["current_balance"] = [-100] - df["avg_passenger_count"] = [0] - - other_ds = PandasDataset(df) - other_ds.expect_column_max_to_be_between("current_balance", -1000, -100) - other_ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0}) - - # this should pass - other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000) - - return other_ds.get_expectation_suite() - - @pytest.mark.integration @pytest.mark.universal_offline_stores def test_historical_retrieval_with_validation(environment, universal_data_sources): @@ -118,9 +52,13 @@ def test_historical_retrieval_with_validation(environment, universal_data_source columns=["order_id", "origin_id", "destination_id"] ) reference_job = store.get_historical_features( - entity_df=entity_df, features=_features, + entity_df=entity_df, + features=_features, + ) + job = store.get_historical_features( + entity_df=entity_df, + features=_features, ) - job = store.get_historical_features(entity_df=entity_df, features=_features,) # Save dataset using reference job and retrieve it store.create_saved_dataset( @@ -149,7 +87,8 @@ def test_historical_retrieval_fails_on_validation(environment, universal_data_so ) reference_job = store.get_historical_features( - entity_df=entity_df, features=_features, + entity_df=entity_df, + features=_features, ) store.create_saved_dataset( @@ -158,7 +97,10 @@ def test_historical_retrieval_fails_on_validation(environment, universal_data_so storage=environment.data_source_creator.create_saved_dataset_destination(), ) - job = store.get_historical_features(entity_df=entity_df, features=_features,) + job = store.get_historical_features( + entity_df=entity_df, + features=_features, + ) with pytest.raises(ValidationFailed) as exc_info: job.to_df( @@ -349,3 +291,72 @@ def 
test_e2e_validation_via_cli(environment, universal_data_sources): p = runner.run(validate_args, cwd=local_repo.repo_path) assert p.returncode == 1, p.stdout.decode() assert "Validation failed" in p.stdout.decode(), p.stderr.decode() + + +# Great expectations profilers created for testing + + +@ge_profiler +def configurable_profiler(dataset: PandasDataset) -> ExpectationSuite: + from great_expectations.profile.user_configurable_profiler import ( + UserConfigurableProfiler, + ) + + return UserConfigurableProfiler( + profile_dataset=dataset, + ignored_columns=["event_timestamp"], + excluded_expectations=[ + "expect_table_columns_to_match_ordered_list", + "expect_table_row_count_to_be_between", + ], + value_set_threshold="few", + ).build_suite() + + +@ge_profiler(with_feature_metadata=True) +def profiler_with_feature_metadata(dataset: PandasDataset) -> ExpectationSuite: + from great_expectations.profile.user_configurable_profiler import ( + UserConfigurableProfiler, + ) + + # always present + dataset.expect_column_values_to_be_in_set( + "global_stats__avg_ride_length__status", {FieldStatus.PRESENT} + ) + + # present at least in 70% of rows + dataset.expect_column_values_to_be_in_set( + "customer_profile__current_balance__status", {FieldStatus.PRESENT}, mostly=0.7 + ) + + return UserConfigurableProfiler( + profile_dataset=dataset, + ignored_columns=["event_timestamp"] + + [ + c + for c in dataset.columns + if c.endswith("__timestamp") or c.endswith("__status") + ], + excluded_expectations=[ + "expect_table_columns_to_match_ordered_list", + "expect_table_row_count_to_be_between", + ], + value_set_threshold="few", + ).build_suite() + + +@ge_profiler +def profiler_with_unrealistic_expectations(dataset: PandasDataset) -> ExpectationSuite: + # need to create dataframe with corrupted data first + df = pd.DataFrame() + df["current_balance"] = [-100] + df["avg_passenger_count"] = [0] + + other_ds = PandasDataset(df) + other_ds.expect_column_max_to_be_between("current_balance", 
-1000, -100) + other_ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0}) + + # this should pass + other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000) + + return other_ds.get_expectation_suite() diff --git a/sdk/python/tests/integration/feature_repos/integration_test_repo_config.py b/sdk/python/tests/integration/feature_repos/integration_test_repo_config.py index 74ce37f17a..4662734383 100644 --- a/sdk/python/tests/integration/feature_repos/integration_test_repo_config.py +++ b/sdk/python/tests/integration/feature_repos/integration_test_repo_config.py @@ -1,5 +1,6 @@ import hashlib from dataclasses import dataclass +from enum import Enum from typing import Dict, Optional, Type, Union from tests.integration.feature_repos.universal.data_source_creator import ( @@ -13,6 +14,11 @@ ) +class RegistryLocation(Enum): + Local = 1 + S3 = 2 + + @dataclass(frozen=False) class IntegrationTestRepoConfig: """ @@ -25,10 +31,13 @@ class IntegrationTestRepoConfig: offline_store_creator: Type[DataSourceCreator] = FileDataSourceCreator online_store_creator: Optional[Type[OnlineStoreCreator]] = None + batch_engine: Optional[Union[str, Dict]] = "local" + registry_location: RegistryLocation = RegistryLocation.Local + full_feature_names: bool = True infer_features: bool = False python_feature_server: bool = False - go_feature_retrieval: bool = False + go_feature_serving: bool = False def __repr__(self) -> str: if not self.online_store_creator: @@ -52,7 +61,7 @@ def __repr__(self) -> str: f"{self.offline_store_creator.__name__.split('.')[-1].replace('DataSourceCreator', '')}", online_store_type, f"python_fs:{self.python_feature_server}", - f"go_fs:{self.go_feature_retrieval}", + f"go_fs:{self.go_feature_serving}", ] ) @@ -68,6 +77,6 @@ def __eq__(self, other): and self.online_store == other.online_store and self.offline_store_creator == other.offline_store_creator and self.online_store_creator == other.online_store_creator - and 
self.go_feature_retrieval == other.go_feature_retrieval + and self.go_feature_serving == other.go_feature_serving and self.python_feature_server == other.python_feature_server ) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 6f40d3171b..776fff3bb9 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -22,6 +22,7 @@ from feast.repo_config import RegistryConfig, RepoConfig from tests.integration.feature_repos.integration_test_repo_config import ( IntegrationTestRepoConfig, + RegistryLocation, ) from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, @@ -64,8 +65,6 @@ ) DYNAMO_CONFIG = {"type": "dynamodb", "region": "us-west-2"} -# Port 12345 will chosen as default for redis node configuration because Redis Cluster is started off of nodes -# 6379 -> 6384. This causes conflicts in cli integration tests so we manually keep them separate. 
REDIS_CONFIG = {"type": "redis", "connection_string": "localhost:6379,db=0"} REDIS_CLUSTER_CONFIG = { "type": "redis", @@ -74,11 +73,22 @@ "connection_string": "127.0.0.1:6001,127.0.0.1:6002,127.0.0.1:6003", } +SNOWFLAKE_CONFIG = { + "type": "snowflake.online", + "account": os.environ.get("SNOWFLAKE_CI_DEPLOYMENT", ""), + "user": os.environ.get("SNOWFLAKE_CI_USER", ""), + "password": os.environ.get("SNOWFLAKE_CI_PASSWORD", ""), + "role": os.environ.get("SNOWFLAKE_CI_ROLE", ""), + "warehouse": os.environ.get("SNOWFLAKE_CI_WAREHOUSE", ""), + "database": "FEAST", + "schema": "ONLINE", +} + OFFLINE_STORE_TO_PROVIDER_CONFIG: Dict[str, DataSourceCreator] = { "file": ("local", FileDataSourceCreator), "bigquery": ("gcp", BigQueryDataSourceCreator), "redshift": ("aws", RedshiftDataSourceCreator), - "snowflake": ("aws", RedshiftDataSourceCreator), + "snowflake": ("aws", SnowflakeDataSourceCreator), } AVAILABLE_OFFLINE_STORES: List[Tuple[str, Type[DataSourceCreator]]] = [ @@ -91,6 +101,7 @@ "sqlite": ({"type": "sqlite"}, None), } +# Only configure Cloud DWH if running full integration tests if os.getenv("FEAST_IS_LOCAL_TEST", "False") != "True": AVAILABLE_OFFLINE_STORES.extend( [ @@ -103,6 +114,7 @@ AVAILABLE_ONLINE_STORES["redis"] = (REDIS_CONFIG, None) AVAILABLE_ONLINE_STORES["dynamodb"] = (DYNAMO_CONFIG, None) AVAILABLE_ONLINE_STORES["datastore"] = ("datastore", None) + AVAILABLE_ONLINE_STORES["snowflake"] = (SNOWFLAKE_CONFIG, None) full_repo_configs_module = os.environ.get(FULL_REPO_CONFIGS_MODULE_ENV_NAME) @@ -141,6 +153,7 @@ } +# Replace online stores with emulated online stores if we're running local integration tests if os.getenv("FEAST_LOCAL_ONLINE_CONTAINER", "False").lower() == "true": replacements: Dict[ str, Tuple[Union[str, Dict[str, str]], Optional[Type[OnlineStoreCreator]]] @@ -304,11 +317,12 @@ def values(self): def construct_universal_feature_views( - data_sources: UniversalDataSources, with_odfv: bool = True, + data_sources: UniversalDataSources, + 
with_odfv: bool = True, ) -> UniversalFeatureViews: driver_hourly_stats = create_driver_hourly_stats_feature_view(data_sources.driver) - driver_hourly_stats_base_feature_view = create_driver_hourly_stats_batch_feature_view( - data_sources.driver + driver_hourly_stats_base_feature_view = ( + create_driver_hourly_stats_batch_feature_view(data_sources.driver) ) return UniversalFeatureViews( customer=create_customer_daily_profile_feature_view(data_sources.customer), @@ -379,8 +393,6 @@ def construct_test_environment( online_creator = None online_store = test_repo_config.online_store - repo_dir_name = tempfile.mkdtemp() - if test_repo_config.python_feature_server and test_repo_config.provider == "aws": from feast.infra.feature_servers.aws_lambda.config import ( AwsLambdaFeatureServerConfig, @@ -388,28 +400,43 @@ def construct_test_environment( feature_server = AwsLambdaFeatureServerConfig( enabled=True, - execution_role_name="arn:aws:iam::402087665549:role/lambda_execution_role", + execution_role_name=os.getenv( + "AWS_LAMBDA_ROLE", + "arn:aws:iam::402087665549:role/lambda_execution_role", + ), ) - registry = ( - f"s3://feast-integration-tests/registries/{project}/registry.db" - ) # type: Union[str, RegistryConfig] else: feature_server = LocalFeatureServerConfig( feature_logging=FeatureLoggingConfig(enabled=True) ) + + repo_dir_name = tempfile.mkdtemp() + if ( + test_repo_config.python_feature_server and test_repo_config.provider == "aws" + ) or test_repo_config.registry_location == RegistryLocation.S3: + aws_registry_path = os.getenv( + "AWS_REGISTRY_PATH", "s3://feast-integration-tests/registries" + ) + registry: Union[ + str, RegistryConfig + ] = f"{aws_registry_path}/{project}/registry.db" + else: registry = RegistryConfig( - path=str(Path(repo_dir_name) / "registry.db"), cache_ttl_seconds=1, + path=str(Path(repo_dir_name) / "registry.db"), + cache_ttl_seconds=1, ) + config = RepoConfig( registry=registry, project=project, provider=test_repo_config.provider, 
offline_store=offline_store_config, online_store=online_store, + batch_engine=test_repo_config.batch_engine, repo_path=repo_dir_name, feature_server=feature_server, - go_feature_retrieval=test_repo_config.go_feature_retrieval, + go_feature_serving=test_repo_config.go_feature_serving, ) # Create feature_store.yaml out of the config @@ -419,7 +446,7 @@ def construct_test_environment( fs = FeatureStore(repo_dir_name) # We need to initialize the registry, because if nothing is applied in the test before tearing down # the feature store, that will cause the teardown method to blow up. - fs.registry._initialize_registry() + fs.registry._initialize_registry(project) environment = Environment( name=project, test_repo_config=test_repo_config, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py index 620f444159..384037eef1 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py @@ -1,3 +1,4 @@ +import os import uuid from typing import Dict, List, Optional @@ -13,6 +14,7 @@ BigQueryLoggingDestination, SavedDatasetBigQueryStorage, ) +from feast.utils import make_df_tzaware from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) @@ -51,7 +53,12 @@ def teardown(self): self.dataset = None def create_offline_store_config(self): - return BigQueryOfflineStoreConfig() + return BigQueryOfflineStoreConfig( + location=os.getenv("GCS_REGION", "US"), + gcs_staging_location=os.getenv( + "GCS_STAGING_LOCATION", "gs://feast-export/" + ), + ) def create_data_source( self, @@ -72,6 +79,10 @@ def create_data_source( f"{self.gcp_project}.{self.project_name}.{destination_name}" ) + # Make all datetime columns timezone aware. 
This should be the behaviour of + # `BigQueryOfflineStore.offline_write_batch`, but since we're bypassing that API here, we should follow the same + # rule. The schema of this initial dataframe determines the schema in the newly created BigQuery table. + df = make_df_tzaware(df) job = self.client.load_table_from_dataframe(df, destination_name) job.result() diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py index ccc1544bb8..7b8e5e80e6 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py @@ -124,7 +124,9 @@ def _upload_parquet_file(self, df, file_name, minio_endpoint): if not client.bucket_exists(self.bucket): client.make_bucket(self.bucket) client.fput_object( - self.bucket, file_name, self.f.name, + self.bucket, + file_name, + self.f.name, ) def create_data_source( diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py index 3b2794393f..c92a413616 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py @@ -1,3 +1,4 @@ +import os import uuid from typing import Dict, List, Optional @@ -24,16 +25,23 @@ class RedshiftDataSourceCreator(DataSourceCreator): def __init__(self, project_name: str, *args, **kwargs): super().__init__(project_name) - self.client = aws_utils.get_redshift_data_client("us-west-2") - self.s3 = aws_utils.get_s3_resource("us-west-2") + self.client = aws_utils.get_redshift_data_client( + os.getenv("AWS_REGION", "us-west-2") + ) + self.s3 = aws_utils.get_s3_resource(os.getenv("AWS_REGION", "us-west-2")) self.offline_store_config = RedshiftOfflineStoreConfig( - cluster_id="feast-integration-tests", - 
region="us-west-2", - user="admin", - database="feast", - s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion", - iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role", + cluster_id=os.getenv("AWS_CLUSTER_ID", "feast-integration-tests"), + region=os.getenv("AWS_REGION", "us-west-2"), + user=os.getenv("AWS_USER", "admin"), + database=os.getenv("AWS_DB", "feast"), + s3_staging_location=os.getenv( + "AWS_STAGING_LOCATION", + "s3://feast-integration-tests/redshift/tests/ingestion", + ), + iam_role=os.getenv( + "AWS_IAM_ROLE", "arn:aws:iam::402087665549:role/redshift_s3_access_role" + ), ) def create_data_source( diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py index 23466bc00c..ae83ea8eb0 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py @@ -34,6 +34,8 @@ def __init__(self, project_name: str, *args, **kwargs): warehouse=os.environ["SNOWFLAKE_CI_WAREHOUSE"], database="FEAST", schema="OFFLINE", + storage_integration_name="FEAST_S3", + blob_export_location="s3://feast-snowflake-offload/export", ) def create_data_source( diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 3fee0b7001..b6e9aa8fc0 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -130,7 +130,8 @@ def similarity_feature_view( def create_conv_rate_request_source(): return RequestSource( - name="conv_rate_input", schema=[Field(name="val_to_add", dtype=Int32)], + name="conv_rate_input", + schema=[Field(name="val_to_add", dtype=Int32)], ) @@ -296,7 +297,8 @@ def create_field_mapping_feature_view(source): def 
create_pushable_feature_view(batch_source: DataSource): push_source = PushSource( - name="location_stats_push_source", batch_source=batch_source, + name="location_stats_push_source", + batch_source=batch_source, ) return StreamFeatureView( name="pushable_location_stats", diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/datastore.py b/sdk/python/tests/integration/feature_repos/universal/online_store/datastore.py index 6067a1ff4b..b5bbb94f7c 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/datastore.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/datastore.py @@ -27,7 +27,7 @@ def create_online_store(self) -> Dict[str, str]: self.container.start() log_string_to_wait_for = r"\[datastore\] Dev App Server is now running" wait_for_logs( - container=self.container, predicate=log_string_to_wait_for, timeout=5 + container=self.container, predicate=log_string_to_wait_for, timeout=10 ) exposed_port = self.container.get_exposed_port("8081") os.environ[datastore.client.DATASTORE_EMULATOR_HOST] = f"0.0.0.0:{exposed_port}" diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/dynamodb.py b/sdk/python/tests/integration/feature_repos/universal/online_store/dynamodb.py index 473b7acee9..1aefdffb24 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/dynamodb.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/dynamodb.py @@ -21,7 +21,7 @@ def create_online_store(self) -> Dict[str, str]: "Initializing DynamoDB Local with the following configuration:" ) wait_for_logs( - container=self.container, predicate=log_string_to_wait_for, timeout=5 + container=self.container, predicate=log_string_to_wait_for, timeout=10 ) exposed_port = self.container.get_exposed_port("8000") return { diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/hbase.py 
b/sdk/python/tests/integration/feature_repos/universal/online_store/hbase.py index ecaace8709..dba611b30b 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/hbase.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/hbase.py @@ -19,7 +19,7 @@ def create_online_store(self) -> Dict[str, str]: "Initializing Hbase Local with the following configuration:" ) wait_for_logs( - container=self.container, predicate=log_string_to_wait_for, timeout=5 + container=self.container, predicate=log_string_to_wait_for, timeout=10 ) exposed_port = self.container.get_exposed_port("9090") return {"type": "hbase", "host": "127.0.0.1", "port": exposed_port} diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/redis.py b/sdk/python/tests/integration/feature_repos/universal/online_store/redis.py index 4995187665..11d62d9d30 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/redis.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/redis.py @@ -17,7 +17,7 @@ def create_online_store(self) -> Dict[str, str]: self.container.start() log_string_to_wait_for = "Ready to accept connections" wait_for_logs( - container=self.container, predicate=log_string_to_wait_for, timeout=5 + container=self.container, predicate=log_string_to_wait_for, timeout=10 ) exposed_port = self.container.get_exposed_port("6379") return {"type": "redis", "connection_string": f"localhost:{exposed_port},db=0"} diff --git a/sdk/python/tests/integration/materialization/test_lambda.py b/sdk/python/tests/integration/materialization/test_lambda.py new file mode 100644 index 0000000000..8ffd31e0cd --- /dev/null +++ b/sdk/python/tests/integration/materialization/test_lambda.py @@ -0,0 +1,74 @@ +from datetime import timedelta + +import pytest + +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_view import FeatureView +from feast.types import ValueType +from 
tests.data.data_creator import create_basic_driver_dataset +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, + RegistryLocation, +) +from tests.integration.feature_repos.repo_configuration import ( + construct_test_environment, +) +from tests.integration.feature_repos.universal.data_sources.redshift import ( + RedshiftDataSourceCreator, +) +from tests.utils.e2e_test_validation import validate_offline_online_store_consistency + + +@pytest.mark.integration +def test_lambda_materialization_consistency(): + lambda_config = IntegrationTestRepoConfig( + provider="aws", + online_store={"type": "dynamodb", "region": "us-west-2"}, + offline_store_creator=RedshiftDataSourceCreator, + batch_engine={ + "type": "lambda", + "materialization_image": "402087665549.dkr.ecr.us-west-2.amazonaws.com/feast-lambda-consumer:v1", + "lambda_role": "arn:aws:iam::402087665549:role/lambda_execution_role", + }, + registry_location=RegistryLocation.S3, + ) + lambda_environment = construct_test_environment(lambda_config, None) + + df = create_basic_driver_dataset() + ds = lambda_environment.data_source_creator.create_data_source( + df, + lambda_environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + + fs = lambda_environment.feature_store + driver = Entity( + name="driver_id", + join_key="driver_id", + value_type=ValueType.INT64, + ) + + driver_stats_fv = FeatureView( + name="driver_hourly_stats", + entities=["driver_id"], + ttl=timedelta(weeks=52), + features=[Feature(name="value", dtype=ValueType.FLOAT)], + batch_source=ds, + ) + + try: + + fs.apply([driver, driver_stats_fv]) + + print(df) + + # materialization is run in two steps and + # we use timestamp from generated dataframe as a split point + split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1) + + print(f"Split datetime: {split_dt}") + + validate_offline_online_store_consistency(fs, driver_stats_fv, split_dt) + finally: + fs.teardown() diff --git 
a/sdk/python/tests/integration/offline_store/test_feature_logging.py b/sdk/python/tests/integration/offline_store/test_feature_logging.py index a6f8e56de7..eba994544d 100644 --- a/sdk/python/tests/integration/offline_store/test_feature_logging.py +++ b/sdk/python/tests/integration/offline_store/test_feature_logging.py @@ -22,7 +22,7 @@ location, ) from tests.integration.feature_repos.universal.feature_views import conv_rate_plus_100 -from tests.utils.logged_features import prepare_logs, to_logs_dataset +from tests.utils.test_log_creator import prepare_logs, to_logs_dataset @pytest.mark.integration @@ -65,12 +65,14 @@ def test_feature_service_logging(environment, universal_data_sources, pass_as_pa with to_logs_dataset(first_batch, pass_as_path) as logs: store.write_logged_features( - source=feature_service, logs=logs, + source=feature_service, + logs=logs, ) with to_logs_dataset(second_batch, pass_as_path) as logs: store.write_logged_features( - source=feature_service, logs=logs, + source=feature_service, + logs=logs, ) expected_columns = list(set(logs_df.columns) - {LOG_DATE_FIELD}) diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index 30ead98389..b8c465946d 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -7,92 +7,55 @@ from feast import FeatureView, Field from feast.types import Float32, Int32 +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, +) from tests.integration.feature_repos.universal.entities import driver @pytest.mark.integration -@pytest.mark.universal_offline_stores(only=["file", "redshift"]) -@pytest.mark.universal_online_stores(only=["sqlite"]) -def test_writing_columns_in_incorrect_order_fails(environment, universal_data_sources): - # TODO(kevjumba) handle incorrect order later, for now schema must be 
in the order that the filesource is in +@pytest.mark.universal_offline_stores +def test_reorder_columns(environment, universal_data_sources): + """Tests that a dataframe with columns in the wrong order is reordered.""" store = environment.feature_store _, _, data_sources = universal_data_sources - driver_stats = FeatureView( - name="driver_stats", - entities=["driver"], - schema=[ - Field(name="avg_daily_trips", dtype=Int32), - Field(name="conv_rate", dtype=Float32), - ], - source=data_sources.driver, - ) + feature_views = construct_universal_feature_views(data_sources) + driver_fv = feature_views.driver + store.apply([driver(), driver_fv]) now = datetime.utcnow() ts = pd.Timestamp(now).round("ms") - entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} - ) - - store.apply([driver(), driver_stats]) - df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert df["conv_rate"].isnull().all() - assert df["avg_daily_trips"].isnull().all() - - expected_df = pd.DataFrame.from_dict( + # This dataframe has columns in the wrong order. 
+ df_to_write = pd.DataFrame.from_dict( { - "driver_id": [1001, 1002], - "event_timestamp": [ts - timedelta(hours=3), ts], - "conv_rate": [random.random(), random.random()], "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts], + "conv_rate": [random.random(), random.random()], + "event_timestamp": [ts, ts], + "acc_rate": [random.random(), random.random()], + "driver_id": [1001, 1001], }, ) - with pytest.raises(ValueError): - store._write_to_offline_store( - driver_stats.name, expected_df, allow_registry_cache=False - ) + + store.write_to_offline_store( + driver_fv.name, df_to_write, allow_registry_cache=False + ) @pytest.mark.integration -@pytest.mark.universal_offline_stores(only=["file", "redshift"]) -@pytest.mark.universal_online_stores(only=["sqlite"]) +@pytest.mark.universal_offline_stores def test_writing_incorrect_schema_fails(environment, universal_data_sources): - # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in + """Tests that writing a dataframe with an incorrect schema fails.""" store = environment.feature_store _, _, data_sources = universal_data_sources - driver_stats = FeatureView( - name="driver_stats", - entities=["driver"], - schema=[ - Field(name="avg_daily_trips", dtype=Int32), - Field(name="conv_rate", dtype=Float32), - ], - source=data_sources.driver, - ) + feature_views = construct_universal_feature_views(data_sources) + driver_fv = feature_views.driver + store.apply([driver(), driver_fv]) now = datetime.utcnow() ts = pd.Timestamp(now).round("ms") - entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} - ) - - store.apply([driver(), driver_stats]) - df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert df["conv_rate"].isnull().all() - assert 
df["avg_daily_trips"].isnull().all() - expected_df = pd.DataFrame.from_dict( { "event_timestamp": [ts - timedelta(hours=3), ts], @@ -103,20 +66,20 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): }, ) with pytest.raises(ValueError): - store._write_to_offline_store( - driver_stats.name, expected_df, allow_registry_cache=False + store.write_to_offline_store( + driver_fv.name, expected_df, allow_registry_cache=False ) @pytest.mark.integration @pytest.mark.universal_offline_stores -@pytest.mark.universal_online_stores(only=["sqlite"]) def test_writing_consecutively_to_offline_store(environment, universal_data_sources): store = environment.feature_store _, _, data_sources = universal_data_sources + driver_entity = driver() driver_stats = FeatureView( name="driver_stats", - entities=["driver"], + entities=[driver_entity], schema=[ Field(name="avg_daily_trips", dtype=Int32), Field(name="conv_rate", dtype=Float32), @@ -134,23 +97,28 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour entity_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1001], - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], + "event_timestamp": [ts + timedelta(hours=3), ts + timedelta(hours=4)], } ) - store.apply([driver(), driver_stats]) + store.apply([driver_entity, driver_stats]) df = store.get_historical_features( entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], + features=[ + "driver_stats:conv_rate", + "driver_stats:acc_rate", + "driver_stats:avg_daily_trips", + ], full_feature_names=False, ).to_df() assert df["conv_rate"].isnull().all() + assert df["acc_rate"].isnull().all() assert df["avg_daily_trips"].isnull().all() first_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], + "event_timestamp": [ts + timedelta(hours=3), ts + timedelta(hours=4)], "driver_id": [1001, 1001], "conv_rate": [random.random(), 
random.random()], "acc_rate": [random.random(), random.random()], @@ -158,29 +126,43 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour "created": [ts, ts], }, ) - store._write_to_offline_store( + first_df = first_df.astype({"conv_rate": "float32", "acc_rate": "float32"}) + store.write_to_offline_store( driver_stats.name, first_df, allow_registry_cache=False ) - after_write_df = store.get_historical_features( + after_write_df: pd.DataFrame = store.get_historical_features( entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], + features=[ + "driver_stats:conv_rate", + "driver_stats:acc_rate", + "driver_stats:avg_daily_trips", + ], full_feature_names=False, ).to_df() - - assert len(after_write_df) == len(first_df) - assert np.where( - after_write_df["conv_rate"].reset_index(drop=True) - == first_df["conv_rate"].reset_index(drop=True) + after_write_df = after_write_df.sort_values("event_timestamp").reset_index( + drop=True ) - assert np.where( - after_write_df["avg_daily_trips"].reset_index(drop=True) - == first_df["avg_daily_trips"].reset_index(drop=True) + + print(f"After: {after_write_df}\nFirst: {first_df}") + print( + f"After: {after_write_df['conv_rate'].reset_index(drop=True)}\nFirst: {first_df['conv_rate'].reset_index(drop=True)}" ) + assert len(after_write_df) == len(first_df) + for field in ["conv_rate", "acc_rate", "avg_daily_trips"]: + assert np.equal( + after_write_df[field].reset_index(drop=True), + first_df[field].reset_index(drop=True), + ).all(), ( + f"Field: {field}\n" + f"After: {after_write_df[field].reset_index(drop=True)}\n" + f"First: {first_df[field].reset_index(drop=True)}" + ) + second_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=1), ts], + "event_timestamp": [ts + timedelta(hours=5), ts + timedelta(hours=6)], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], @@ -188,8 
+170,9 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour "created": [ts, ts], }, ) + second_df = second_df.astype({"conv_rate": "float32", "acc_rate": "float32"}) - store._write_to_offline_store( + store.write_to_offline_store( driver_stats.name, second_df, allow_registry_cache=False ) @@ -197,10 +180,10 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour { "driver_id": [1001, 1001, 1001, 1001], "event_timestamp": [ - ts - timedelta(hours=4), - ts - timedelta(hours=3), - ts - timedelta(hours=1), - ts, + ts + timedelta(hours=3), + ts + timedelta(hours=4), + ts + timedelta(hours=5), + ts + timedelta(hours=6), ], } ) @@ -214,18 +197,17 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ], full_feature_names=False, ).to_df() - + after_write_df = after_write_df.sort_values("event_timestamp").reset_index( + drop=True + ) expected_df = pd.concat([first_df, second_df]) assert len(after_write_df) == len(expected_df) - assert np.where( - after_write_df["conv_rate"].reset_index(drop=True) - == expected_df["conv_rate"].reset_index(drop=True) - ) - assert np.where( - after_write_df["acc_rate"].reset_index(drop=True) - == expected_df["acc_rate"].reset_index(drop=True) - ) - assert np.where( - after_write_df["avg_daily_trips"].reset_index(drop=True) - == expected_df["avg_daily_trips"].reset_index(drop=True) - ) + for field in ["conv_rate", "acc_rate", "avg_daily_trips"]: + assert np.equal( + after_write_df[field].reset_index(drop=True), + expected_df[field].reset_index(drop=True), + ).all(), ( + f"Field: {field}\n" + f"After: {after_write_df[field].reset_index(drop=True)}\n" + f"First: {expected_df[field].reset_index(drop=True)}" + ) diff --git a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py b/sdk/python/tests/integration/offline_store/test_push_features_to_offline_store.py similarity index 81% rename from 
sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py rename to sdk/python/tests/integration/offline_store/test_push_features_to_offline_store.py index 5cea8a36ef..0b1db9011a 100644 --- a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_push_features_to_offline_store.py @@ -8,24 +8,19 @@ from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, ) -from tests.integration.feature_repos.universal.entities import ( - customer, - driver, - location, -) +from tests.integration.feature_repos.universal.entities import location @pytest.mark.integration @pytest.mark.universal_offline_stores -@pytest.mark.universal_online_stores(only=["sqlite"]) -def test_push_features_and_read_from_offline_store(environment, universal_data_sources): +def test_push_features_and_read(environment, universal_data_sources): store = environment.feature_store - - (_, _, data_sources) = universal_data_sources + _, _, data_sources = universal_data_sources feature_views = construct_universal_feature_views(data_sources) - now = pd.Timestamp(datetime.datetime.utcnow()).round("ms") + location_fv = feature_views.pushed_locations + store.apply([location(), location_fv]) - store.apply([driver(), customer(), location(), *feature_views.values()]) + now = pd.Timestamp(datetime.datetime.utcnow()).round("ms") entity_df = pd.DataFrame.from_dict({"location_id": [1], "event_timestamp": [now]}) before_df = store.get_historical_features( diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py index dfe14d73f9..645e0f7331 100644 --- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py +++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py @@ -17,7 +17,9 @@ @pytest.mark.skip( reason="No way to run this test today. 
Credentials conflict with real AWS credentials in CI" ) -def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources,): +def test_registration_and_retrieval_from_custom_s3_endpoint( + universal_data_sources, +): config = IntegrationTestRepoConfig( offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator" ) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 2076ab2aed..718b7577d9 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -1,20 +1,13 @@ import random import time from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional import numpy as np import pandas as pd import pytest -from pandas.testing import assert_frame_equal as pd_assert_frame_equal -from pytz import utc -from feast import utils from feast.entity import Entity -from feast.errors import ( - FeatureNameCollisionError, - RequestDataNotFoundInEntityDfException, -) +from feast.errors import RequestDataNotFoundInEntityDfException from feast.feature_service import FeatureService from feast.feature_view import FeatureView from feast.field import Field @@ -34,253 +27,20 @@ driver, location, ) +from tests.utils.feature_records import ( + assert_feature_service_correctness, + assert_feature_service_entity_mapping_correctness, + get_expected_training_df, + get_response_feature_name, + validate_dataframes, +) np.random.seed(0) -def convert_timestamp_records_to_utc( - records: List[Dict[str, Any]], column: str -) -> List[Dict[str, Any]]: - for record in records: - record[column] = utils.make_tzaware(record[column]).astimezone(utc) - return records - - -# Find the latest record in the given time range and filter -def find_asof_record( - records: 
List[Dict[str, Any]], - ts_key: str, - ts_start: datetime, - ts_end: datetime, - filter_keys: Optional[List[str]] = None, - filter_values: Optional[List[Any]] = None, -) -> Dict[str, Any]: - filter_keys = filter_keys or [] - filter_values = filter_values or [] - assert len(filter_keys) == len(filter_values) - found_record: Dict[str, Any] = {} - for record in records: - if ( - all( - [ - record[filter_key] == filter_value - for filter_key, filter_value in zip(filter_keys, filter_values) - ] - ) - and ts_start <= record[ts_key] <= ts_end - ): - if not found_record or found_record[ts_key] < record[ts_key]: - found_record = record - return found_record - - -def get_expected_training_df( - customer_df: pd.DataFrame, - customer_fv: FeatureView, - driver_df: pd.DataFrame, - driver_fv: FeatureView, - orders_df: pd.DataFrame, - order_fv: FeatureView, - location_df: pd.DataFrame, - location_fv: FeatureView, - global_df: pd.DataFrame, - global_fv: FeatureView, - field_mapping_df: pd.DataFrame, - field_mapping_fv: FeatureView, - entity_df: pd.DataFrame, - event_timestamp: str, - full_feature_names: bool = False, -): - # Convert all pandas dataframes into records with UTC timestamps - customer_records = convert_timestamp_records_to_utc( - customer_df.to_dict("records"), customer_fv.batch_source.timestamp_field - ) - driver_records = convert_timestamp_records_to_utc( - driver_df.to_dict("records"), driver_fv.batch_source.timestamp_field - ) - order_records = convert_timestamp_records_to_utc( - orders_df.to_dict("records"), event_timestamp - ) - location_records = convert_timestamp_records_to_utc( - location_df.to_dict("records"), location_fv.batch_source.timestamp_field - ) - global_records = convert_timestamp_records_to_utc( - global_df.to_dict("records"), global_fv.batch_source.timestamp_field - ) - field_mapping_records = convert_timestamp_records_to_utc( - field_mapping_df.to_dict("records"), - field_mapping_fv.batch_source.timestamp_field, - ) - entity_rows = 
convert_timestamp_records_to_utc( - entity_df.to_dict("records"), event_timestamp - ) - - # Manually do point-in-time join of driver, customer, and order records against - # the entity df - for entity_row in entity_rows: - customer_record = find_asof_record( - customer_records, - ts_key=customer_fv.batch_source.timestamp_field, - ts_start=entity_row[event_timestamp] - customer_fv.ttl, - ts_end=entity_row[event_timestamp], - filter_keys=["customer_id"], - filter_values=[entity_row["customer_id"]], - ) - driver_record = find_asof_record( - driver_records, - ts_key=driver_fv.batch_source.timestamp_field, - ts_start=entity_row[event_timestamp] - driver_fv.ttl, - ts_end=entity_row[event_timestamp], - filter_keys=["driver_id"], - filter_values=[entity_row["driver_id"]], - ) - order_record = find_asof_record( - order_records, - ts_key=customer_fv.batch_source.timestamp_field, - ts_start=entity_row[event_timestamp] - order_fv.ttl, - ts_end=entity_row[event_timestamp], - filter_keys=["customer_id", "driver_id"], - filter_values=[entity_row["customer_id"], entity_row["driver_id"]], - ) - origin_record = find_asof_record( - location_records, - ts_key=location_fv.batch_source.timestamp_field, - ts_start=order_record[event_timestamp] - location_fv.ttl, - ts_end=order_record[event_timestamp], - filter_keys=["location_id"], - filter_values=[order_record["origin_id"]], - ) - destination_record = find_asof_record( - location_records, - ts_key=location_fv.batch_source.timestamp_field, - ts_start=order_record[event_timestamp] - location_fv.ttl, - ts_end=order_record[event_timestamp], - filter_keys=["location_id"], - filter_values=[order_record["destination_id"]], - ) - global_record = find_asof_record( - global_records, - ts_key=global_fv.batch_source.timestamp_field, - ts_start=order_record[event_timestamp] - global_fv.ttl, - ts_end=order_record[event_timestamp], - ) - - field_mapping_record = find_asof_record( - field_mapping_records, - 
ts_key=field_mapping_fv.batch_source.timestamp_field, - ts_start=order_record[event_timestamp] - field_mapping_fv.ttl, - ts_end=order_record[event_timestamp], - ) - - entity_row.update( - { - ( - f"customer_profile__{k}" if full_feature_names else k - ): customer_record.get(k, None) - for k in ( - "current_balance", - "avg_passenger_count", - "lifetime_trip_count", - ) - } - ) - entity_row.update( - { - (f"driver_stats__{k}" if full_feature_names else k): driver_record.get( - k, None - ) - for k in ("conv_rate", "avg_daily_trips") - } - ) - entity_row.update( - { - (f"order__{k}" if full_feature_names else k): order_record.get(k, None) - for k in ("order_is_success",) - } - ) - entity_row.update( - { - "origin__temperature": origin_record.get("temperature", None), - "destination__temperature": destination_record.get("temperature", None), - } - ) - entity_row.update( - { - (f"global_stats__{k}" if full_feature_names else k): global_record.get( - k, None - ) - for k in ("num_rides", "avg_ride_length",) - } - ) - - # get field_mapping_record by column name, but label by feature name - entity_row.update( - { - ( - f"field_mapping__{feature}" if full_feature_names else feature - ): field_mapping_record.get(column, None) - for ( - column, - feature, - ) in field_mapping_fv.batch_source.field_mapping.items() - } - ) - - # Convert records back to pandas dataframe - expected_df = pd.DataFrame(entity_rows) - - # Move "event_timestamp" column to front - current_cols = expected_df.columns.tolist() - current_cols.remove(event_timestamp) - expected_df = expected_df[[event_timestamp] + current_cols] - - # Cast some columns to expected types, since we lose information when converting pandas DFs into Python objects. 
- if full_feature_names: - expected_column_types = { - "order__order_is_success": "int32", - "driver_stats__conv_rate": "float32", - "customer_profile__current_balance": "float32", - "customer_profile__avg_passenger_count": "float32", - "global_stats__avg_ride_length": "float32", - "field_mapping__feature_name": "int32", - } - else: - expected_column_types = { - "order_is_success": "int32", - "conv_rate": "float32", - "current_balance": "float32", - "avg_passenger_count": "float32", - "avg_ride_length": "float32", - "feature_name": "int32", - } - - for col, typ in expected_column_types.items(): - expected_df[col] = expected_df[col].astype(typ) - - conv_feature_name = "driver_stats__conv_rate" if full_feature_names else "conv_rate" - conv_plus_feature_name = response_feature_name( - "conv_rate_plus_100", full_feature_names - ) - expected_df[conv_plus_feature_name] = expected_df[conv_feature_name] + 100 - expected_df[ - response_feature_name("conv_rate_plus_100_rounded", full_feature_names) - ] = ( - expected_df[conv_plus_feature_name] - .astype("float") - .round() - .astype(pd.Int32Dtype()) - ) - if "val_to_add" in expected_df.columns: - expected_df[ - response_feature_name("conv_rate_plus_val_to_add", full_feature_names) - ] = (expected_df[conv_feature_name] + expected_df["val_to_add"]) - - return expected_df - - @pytest.mark.integration @pytest.mark.universal_offline_stores -@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: f"full:{v}") def test_historical_features(environment, universal_data_sources, full_feature_names): store = environment.feature_store @@ -370,6 +130,11 @@ def test_historical_features(environment, universal_data_sources, full_feature_n full_feature_names=full_feature_names, ) + if job_from_df.supports_remote_storage_export(): + files = job_from_df.to_remote_storage() + print(files) + assert len(files) > 0 # This test should be way more 
detailed + start_time = datetime.utcnow() actual_df_from_df_entities = job_from_df.to_df() @@ -378,7 +143,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")) assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns) - assert_frame_equal( + validate_dataframes( expected_df, actual_df_from_df_entities, keys=[event_timestamp, "order_id", "driver_id", "customer_id"], @@ -402,7 +167,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n ) table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas() - assert_frame_equal( + validate_dataframes( expected_df, table_from_df_entities, keys=[event_timestamp, "order_id", "driver_id", "customer_id"], @@ -410,7 +175,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n @pytest.mark.integration -@pytest.mark.universal +@pytest.mark.universal_offline_stores @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) def test_historical_features_with_shared_batch_source( environment, universal_data_sources, full_feature_names @@ -552,15 +317,15 @@ def test_historical_features_with_entities_from_query( # Not requesting the on demand transform with an entity_df query (can't add request data in them) expected_df_query = full_expected_df.drop( columns=[ - response_feature_name("conv_rate_plus_100", full_feature_names), - response_feature_name("conv_rate_plus_100_rounded", full_feature_names), - response_feature_name("avg_daily_trips", full_feature_names), - response_feature_name("conv_rate", full_feature_names), + get_response_feature_name("conv_rate_plus_100", full_feature_names), + get_response_feature_name("conv_rate_plus_100_rounded", full_feature_names), + get_response_feature_name("avg_daily_trips", full_feature_names), + get_response_feature_name("conv_rate", full_feature_names), 
"origin__temperature", "destination__temperature", ] ) - assert_frame_equal( + validate_dataframes( expected_df_query, actual_df_from_sql_entities, keys=[event_timestamp, "order_id", "driver_id", "customer_id"], @@ -572,7 +337,7 @@ def test_historical_features_with_entities_from_query( table_from_sql_entities[col].dtype ) - assert_frame_equal( + validate_dataframes( expected_df_query, table_from_sql_entities, keys=[event_timestamp, "order_id", "driver_id", "customer_id"], @@ -636,28 +401,100 @@ def test_historical_features_persisting( full_feature_names, ).drop( columns=[ - response_feature_name("conv_rate_plus_100", full_feature_names), - response_feature_name("conv_rate_plus_100_rounded", full_feature_names), - response_feature_name("avg_daily_trips", full_feature_names), - response_feature_name("conv_rate", full_feature_names), + get_response_feature_name("conv_rate_plus_100", full_feature_names), + get_response_feature_name("conv_rate_plus_100_rounded", full_feature_names), + get_response_feature_name("avg_daily_trips", full_feature_names), + get_response_feature_name("conv_rate", full_feature_names), "origin__temperature", "destination__temperature", ] ) - assert_frame_equal( + validate_dataframes( expected_df, saved_dataset.to_df(), keys=[event_timestamp, "driver_id", "customer_id"], ) - assert_frame_equal( + validate_dataframes( job.to_df(), saved_dataset.to_df(), keys=[event_timestamp, "driver_id", "customer_id"], ) +@pytest.mark.integration +@pytest.mark.universal_offline_stores +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_historical_features_with_no_ttl( + environment, universal_data_sources, full_feature_names +): + store = environment.feature_store + + (entities, datasets, data_sources) = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + # Remove ttls. 
+ feature_views.customer.ttl = timedelta(seconds=0) + feature_views.order.ttl = timedelta(seconds=0) + feature_views.global_fv.ttl = timedelta(seconds=0) + feature_views.field_mapping.ttl = timedelta(seconds=0) + + store.apply([driver(), customer(), location(), *feature_views.values()]) + + entity_df = datasets.entity_df.drop( + columns=["order_id", "origin_id", "destination_id"] + ) + + job = store.get_historical_features( + entity_df=entity_df, + features=[ + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + "order:order_is_success", + "global_stats:num_rides", + "global_stats:avg_ride_length", + "field_mapping:feature_name", + ], + full_feature_names=full_feature_names, + ) + + event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL + expected_df = get_expected_training_df( + datasets.customer_df, + feature_views.customer, + datasets.driver_df, + feature_views.driver, + datasets.orders_df, + feature_views.order, + datasets.location_df, + feature_views.location, + datasets.global_df, + feature_views.global_fv, + datasets.field_mapping_df, + feature_views.field_mapping, + entity_df, + event_timestamp, + full_feature_names, + ).drop( + columns=[ + get_response_feature_name("conv_rate_plus_100", full_feature_names), + get_response_feature_name("conv_rate_plus_100_rounded", full_feature_names), + get_response_feature_name("avg_daily_trips", full_feature_names), + get_response_feature_name("conv_rate", full_feature_names), + "origin__temperature", + "destination__temperature", + ] + ) + + validate_dataframes( + expected_df, + job.to_df(), + keys=[event_timestamp, "driver_id", "customer_id"], + ) + + @pytest.mark.integration @pytest.mark.universal_offline_stores def test_historical_features_from_bigquery_sources_containing_backfills(environment): @@ -752,130 +589,4 @@ def test_historical_features_from_bigquery_sources_containing_backfills(environm print(str(f"Time to execute job_from_df.to_df() = 
'{(end_time - start_time)}'\n")) assert sorted(expected_df.columns) == sorted(actual_df.columns) - assert_frame_equal(expected_df, actual_df, keys=["driver_id"]) - - -def response_feature_name(feature: str, full_feature_names: bool) -> str: - if feature in {"conv_rate", "avg_daily_trips"} and full_feature_names: - return f"driver_stats__{feature}" - - if ( - feature - in { - "conv_rate_plus_100", - "conv_rate_plus_100_rounded", - "conv_rate_plus_val_to_add", - } - and full_feature_names - ): - return f"conv_rate_plus_100__{feature}" - - return feature - - -def assert_feature_service_correctness( - store, feature_service, full_feature_names, entity_df, expected_df, event_timestamp -): - - job_from_df = store.get_historical_features( - entity_df=entity_df, - features=feature_service, - full_feature_names=full_feature_names, - ) - - actual_df_from_df_entities = job_from_df.to_df() - - expected_df = expected_df[ - [ - event_timestamp, - "order_id", - "driver_id", - "customer_id", - response_feature_name("conv_rate", full_feature_names), - response_feature_name("conv_rate_plus_100", full_feature_names), - "driver_age", - ] - ] - - assert_frame_equal( - expected_df, - actual_df_from_df_entities, - keys=[event_timestamp, "order_id", "driver_id", "customer_id"], - ) - - -def assert_feature_service_entity_mapping_correctness( - store, feature_service, full_feature_names, entity_df, expected_df, event_timestamp -): - if full_feature_names: - job_from_df = store.get_historical_features( - entity_df=entity_df, - features=feature_service, - full_feature_names=full_feature_names, - ) - actual_df_from_df_entities = job_from_df.to_df() - - expected_df: pd.DataFrame = ( - expected_df.sort_values( - by=[ - event_timestamp, - "order_id", - "driver_id", - "customer_id", - "origin_id", - "destination_id", - ] - ) - .drop_duplicates() - .reset_index(drop=True) - ) - expected_df = expected_df[ - [ - event_timestamp, - "order_id", - "driver_id", - "customer_id", - "origin_id", - 
"destination_id", - "origin__temperature", - "destination__temperature", - ] - ] - - assert_frame_equal( - expected_df, - actual_df_from_df_entities, - keys=[ - event_timestamp, - "order_id", - "driver_id", - "customer_id", - "origin_id", - "destination_id", - ], - ) - else: - # using 2 of the same FeatureView without full_feature_names=True will result in collision - with pytest.raises(FeatureNameCollisionError): - job_from_df = store.get_historical_features( - entity_df=entity_df, - features=feature_service, - full_feature_names=full_feature_names, - ) - - -def assert_frame_equal(expected_df, actual_df, keys): - expected_df: pd.DataFrame = ( - expected_df.sort_values(by=keys).drop_duplicates().reset_index(drop=True) - ) - - actual_df = ( - actual_df[expected_df.columns] - .sort_values(by=keys) - .drop_duplicates() - .reset_index(drop=True) - ) - - pd_assert_frame_equal( - expected_df, actual_df, check_dtype=False, - ) + validate_dataframes(expected_df, actual_df, keys=["driver_id"]) diff --git a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py b/sdk/python/tests/integration/online_store/test_push_features_to_online_store.py similarity index 80% rename from sdk/python/tests/integration/online_store/test_push_online_retrieval.py rename to sdk/python/tests/integration/online_store/test_push_features_to_online_store.py index aa7e3e7f53..42561563f9 100644 --- a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py +++ b/sdk/python/tests/integration/online_store/test_push_features_to_online_store.py @@ -6,22 +6,18 @@ from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, ) -from tests.integration.feature_repos.universal.entities import ( - customer, - driver, - location, -) +from tests.integration.feature_repos.universal.entities import location @pytest.mark.integration @pytest.mark.universal_online_stores def test_push_features_and_read(environment, universal_data_sources): store = 
environment.feature_store - - (_, datasets, data_sources) = universal_data_sources + _, _, data_sources = universal_data_sources feature_views = construct_universal_feature_views(data_sources) + location_fv = feature_views.pushed_locations + store.apply([location(), location_fv]) - store.apply([driver(), customer(), location(), *feature_views.values()]) data = { "location_id": [1], "temperature": [4], diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index b01448e7cc..738b00f7d7 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -34,7 +34,7 @@ create_driver_hourly_stats_feature_view, driver_feature_view, ) -from tests.utils.data_source_utils import prep_file_source +from tests.utils.data_source_test_creator import prep_file_source @pytest.mark.integration @@ -441,82 +441,6 @@ def test_online_retrieval_with_event_timestamps( ) -@pytest.mark.integration -@pytest.mark.universal_online_stores -@pytest.mark.goserver -@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) -def test_stream_feature_view_online_retrieval( - environment, universal_data_sources, feature_server_endpoint, full_feature_names -): - """ - Tests materialization and online retrieval for stream feature views. - - This test is separate from test_online_retrieval since combining feature views and - stream feature views into a single test resulted in test flakiness. This is tech - debt that should be resolved soon. - """ - # Set up feature store. - fs = environment.feature_store - entities, datasets, data_sources = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - pushable_feature_view = feature_views.pushed_locations - fs.apply([location(), pushable_feature_view]) - - # Materialize. 
- fs.materialize( - environment.start_date - timedelta(days=1), - environment.end_date + timedelta(days=1), - ) - - # Get online features by randomly sampling 10 entities that exist in the batch source. - sample_locations = datasets.location_df.sample(10)["location_id"] - entity_rows = [ - {"location_id": sample_location} for sample_location in sample_locations - ] - - feature_refs = [ - "pushable_location_stats:temperature", - ] - unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] - - online_features_dict = get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=feature_refs, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - - # Check that the response has the expected set of keys. - keys = set(online_features_dict.keys()) - expected_keys = set( - f.replace(":", "__") if full_feature_names else f.split(":")[-1] - for f in feature_refs - ) | {"location_id"} - assert ( - keys == expected_keys - ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" - - # Check that the feature values match. 
- tc = unittest.TestCase() - for i, entity_row in enumerate(entity_rows): - df_features = get_latest_feature_values_from_location_df( - entity_row, datasets.location_df - ) - - assert df_features["location_id"] == online_features_dict["location_id"][i] - for unprefixed_feature_ref in unprefixed_feature_refs: - tc.assertAlmostEqual( - df_features[unprefixed_feature_ref], - online_features_dict[ - response_feature_name( - unprefixed_feature_ref, feature_refs, full_feature_names - ) - ][i], - delta=0.0001, - ) - - @pytest.mark.integration @pytest.mark.universal_online_stores @pytest.mark.goserver @@ -534,6 +458,7 @@ def test_online_retrieval( feature_views.driver[["conv_rate"]], feature_views.driver_odfv, feature_views.customer[["current_balance"]], + feature_views.pushed_locations, ], ) feature_service_entity_mapping = FeatureService( @@ -566,7 +491,7 @@ def test_online_retrieval( ) entity_sample = datasets.orders_df.sample(10)[ - ["customer_id", "driver_id", "order_id", "event_timestamp"] + ["customer_id", "driver_id", "order_id", "origin_id", "event_timestamp"] ] orders_df = datasets.orders_df[ ( @@ -585,6 +510,8 @@ def test_online_retrieval( datasets.customer_df["customer_id"].isin(sample_customers) ] + sample_origins = entity_sample["origin_id"] + location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) sample_location_pairs = location_pairs[ np.random.choice(len(location_pairs), 10) @@ -597,10 +524,11 @@ def test_online_retrieval( ] global_df = datasets.global_df + location_df = datasets.location_df entity_rows = [ - {"driver_id": d, "customer_id": c, "val_to_add": 50} - for (d, c) in zip(sample_drivers, sample_customers) + {"driver_id": d, "customer_id": c, "location_id": o, "val_to_add": 50} + for (d, c, o) in zip(sample_drivers, sample_customers, sample_origins) ] feature_refs = [ @@ -614,6 +542,7 @@ def test_online_retrieval( "order:order_is_success", "global_stats:num_rides", "global_stats:avg_ride_length", + 
"pushable_location_stats:temperature", ] unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] # Remove the on demand feature view output features, since they're not present in the source dataframe @@ -644,7 +573,7 @@ def test_online_retrieval( expected_keys = set( f.replace(":", "__") if full_feature_names else f.split(":")[-1] for f in feature_refs - ) | {"customer_id", "driver_id"} + ) | {"customer_id", "driver_id", "location_id"} assert ( keys == expected_keys ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" @@ -657,6 +586,7 @@ def test_online_retrieval( orders_df=orders_df, global_df=global_df, entity_row=entity_row, + location_df=location_df, ) assert df_features["customer_id"] == online_features_dict["customer_id"][i] @@ -695,7 +625,9 @@ def test_online_retrieval( environment=environment, endpoint=feature_server_endpoint, features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], + entity_rows=[ + {"driver_id": 0, "customer_id": 0, "location_id": 0, "val_to_add": 100} + ], full_feature_names=full_feature_names, ) assert missing_responses_dict is not None @@ -715,7 +647,7 @@ def test_online_retrieval( environment=environment, endpoint=feature_server_endpoint, features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0}], + entity_rows=[{"driver_id": 0, "customer_id": 0, "location_id": 0}], full_feature_names=full_feature_names, ) @@ -729,6 +661,7 @@ def test_online_retrieval( customers_df, orders_df, global_df, + location_df, ) entity_rows = [ @@ -857,6 +790,7 @@ def get_latest_feature_values_from_dataframes( orders_df, entity_row, global_df=None, + location_df=None, origin_df=None, destination_df=None, ): @@ -864,6 +798,12 @@ def get_latest_feature_values_from_dataframes( latest_customer_row = get_latest_row( entity_row, customer_df, "customer_id", "customer_id" ) + latest_location_row = get_latest_row( + entity_row, 
+ location_df, + "location_id", + "location_id", + ) # Since the event timestamp columns may contain timestamps of different timezones, # we must first convert the timestamps to UTC before we can compare them. @@ -883,7 +823,7 @@ def get_latest_feature_values_from_dataframes( global_df["event_timestamp"].idxmax() ].to_dict() if origin_df is not None: - latest_location_row = get_latest_feature_values_for_location_df( + latest_location_aliased_row = get_latest_feature_values_for_location_df( entity_row, origin_df, destination_df ) @@ -896,6 +836,7 @@ def get_latest_feature_values_from_dataframes( **latest_driver_row, **latest_orders_row, **latest_global_row, + **latest_location_row, **request_data_features, } if origin_df is not None: @@ -906,12 +847,14 @@ def get_latest_feature_values_from_dataframes( **latest_driver_row, **latest_orders_row, **latest_location_row, + **latest_location_aliased_row, **request_data_features, } return { **latest_customer_row, **latest_driver_row, **latest_orders_row, + **latest_location_row, **request_data_features, } @@ -949,6 +892,7 @@ def assert_feature_service_correctness( customers_df, orders_df, global_df, + location_df, ): feature_service_online_features_dict = get_online_features_dict( environment=environment, @@ -968,6 +912,7 @@ def assert_feature_service_correctness( assert set(feature_service_keys) == set(expected_feature_refs) | { "customer_id", "driver_id", + "location_id", } tc = unittest.TestCase() @@ -978,6 +923,7 @@ def assert_feature_service_correctness( orders_df=orders_df, global_df=global_df, entity_row=entity_row, + location_df=location_df, ) tc.assertAlmostEqual( feature_service_online_features_dict[ diff --git a/sdk/python/tests/integration/registration/test_cli.py b/sdk/python/tests/integration/registration/test_cli.py deleted file mode 100644 index ecc17fc06c..0000000000 --- a/sdk/python/tests/integration/registration/test_cli.py +++ /dev/null @@ -1,364 +0,0 @@ -import os -import tempfile -import uuid -from 
contextlib import contextmanager -from pathlib import Path -from textwrap import dedent -from typing import List - -import pytest -import yaml -from assertpy import assertpy - -from feast import FeatureStore, RepoConfig -from tests.integration.feature_repos.integration_test_repo_config import ( - IntegrationTestRepoConfig, -) -from tests.integration.feature_repos.repo_configuration import Environment -from tests.integration.feature_repos.universal.data_source_creator import ( - DataSourceCreator, -) -from tests.integration.feature_repos.universal.data_sources.bigquery import ( - BigQueryDataSourceCreator, -) -from tests.integration.feature_repos.universal.data_sources.file import ( - FileDataSourceCreator, -) -from tests.integration.feature_repos.universal.data_sources.redshift import ( - RedshiftDataSourceCreator, -) -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - - -@pytest.mark.integration -@pytest.mark.universal_offline_stores -def test_universal_cli(environment: Environment): - project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}" - runner = CliRunner() - - with tempfile.TemporaryDirectory() as repo_dir_name: - try: - repo_path = Path(repo_dir_name) - feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path - ) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text(dedent(feature_store_yaml)) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - result = runner.run(["apply"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - - # Store registry contents, to be compared later. - fs = FeatureStore(repo_path=str(repo_path)) - registry_dict = fs.registry.to_dict(project=project) - # Save only the specs, not the metadata. 
- registry_specs = { - key: [fco["spec"] if "spec" in fco else fco for fco in value] - for key, value in registry_dict.items() - } - - # entity & feature view list commands should succeed - result = runner.run(["entities", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run(["feature-views", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run(["feature-services", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run(["data-sources", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - - # entity & feature view describe commands should succeed when objects exist - result = runner.run(["entities", "describe", "driver"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run( - ["feature-views", "describe", "driver_locations"], cwd=repo_path - ) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run( - ["feature-services", "describe", "driver_locations_service"], - cwd=repo_path, - ) - assertpy.assert_that(result.returncode).is_equal_to(0) - assertpy.assert_that(fs.list_feature_views()).is_length(4) - result = runner.run( - ["data-sources", "describe", "customer_profile_source"], cwd=repo_path, - ) - assertpy.assert_that(result.returncode).is_equal_to(0) - assertpy.assert_that(fs.list_data_sources()).is_length(4) - - # entity & feature view describe commands should fail when objects don't exist - result = runner.run(["entities", "describe", "foo"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(1) - result = runner.run(["feature-views", "describe", "foo"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(1) - result = runner.run(["feature-services", "describe", "foo"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(1) - result = runner.run(["data-sources", 
"describe", "foo"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(1) - - # Doing another apply should be a no op, and should not cause errors - result = runner.run(["apply"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - basic_rw_test( - FeatureStore(repo_path=str(repo_path), config=None), - view_name="driver_locations", - ) - - # Confirm that registry contents have not changed. - registry_dict = fs.registry.to_dict(project=project) - assertpy.assert_that(registry_specs).is_equal_to( - { - key: [fco["spec"] if "spec" in fco else fco for fco in value] - for key, value in registry_dict.items() - } - ) - - result = runner.run(["teardown"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - finally: - runner.run(["teardown"], cwd=repo_path) - - -def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): - offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) - - offline_store_config = offline_creator.create_offline_store_config() - online_store = test_repo_config.online_store - - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=project, - provider=test_repo_config.provider, - offline_store=offline_store_config, - online_store=online_store, - repo_path=str(Path(repo_dir_name)), - ) - config_dict = config.dict() - if ( - isinstance(config_dict["online_store"], dict) - and "redis_type" in config_dict["online_store"] - ): - if str(config_dict["online_store"]["redis_type"]) == "RedisType.redis_cluster": - config_dict["online_store"]["redis_type"] = "redis_cluster" - elif str(config_dict["online_store"]["redis_type"]) == "RedisType.redis": - config_dict["online_store"]["redis_type"] = "redis" - config_dict["repo_path"] = str(config_dict["repo_path"]) - return yaml.safe_dump(config_dict) - - -NULLABLE_ONLINE_STORE_CONFIGS: List[IntegrationTestRepoConfig] = [ - IntegrationTestRepoConfig( - provider="local", - 
offline_store_creator=FileDataSourceCreator, - online_store=None, - ), -] - -if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True": - NULLABLE_ONLINE_STORE_CONFIGS.extend( - [ - IntegrationTestRepoConfig( - provider="gcp", - offline_store_creator=BigQueryDataSourceCreator, - online_store=None, - ), - IntegrationTestRepoConfig( - provider="aws", - offline_store_creator=RedshiftDataSourceCreator, - online_store=None, - ), - ] - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("test_nullable_online_store", NULLABLE_ONLINE_STORE_CONFIGS) -def test_nullable_online_store(test_nullable_online_store) -> None: - project = f"test_nullable_online_store{str(uuid.uuid4()).replace('-', '')[:8]}" - runner = CliRunner() - - with tempfile.TemporaryDirectory() as repo_dir_name: - try: - repo_path = Path(repo_dir_name) - feature_store_yaml = make_feature_store_yaml( - project, test_nullable_online_store, repo_path - ) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text(dedent(feature_store_yaml)) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - result = runner.run(["apply"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - finally: - runner.run(["teardown"], cwd=repo_path) - - -@pytest.mark.integration -@pytest.mark.universal_offline_stores -def test_odfv_apply(environment) -> None: - project = f"test_odfv_apply{str(uuid.uuid4()).replace('-', '')[:8]}" - runner = CliRunner() - - with tempfile.TemporaryDirectory() as repo_dir_name: - try: - repo_path = Path(repo_dir_name) - feature_store_yaml = make_feature_store_yaml( - project, environment.test_repo_config, repo_path - ) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text(dedent(feature_store_yaml)) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("on_demand_feature_view_repo.py")) - result = runner.run(["apply"], cwd=repo_path) - 
assertpy.assert_that(result.returncode).is_equal_to(0) - - # entity & feature view list commands should succeed - result = runner.run(["entities", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - result = runner.run(["on-demand-feature-views", "list"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - finally: - runner.run(["teardown"], cwd=repo_path) - - -@contextmanager -def setup_third_party_provider_repo(provider_name: str): - with tempfile.TemporaryDirectory() as repo_dir_name: - - # Construct an example repo in a temporary dir - repo_path = Path(repo_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: foo - registry: data/registry.db - provider: {provider_name} - online_store: - path: data/online_store.db - type: sqlite - offline_store: - type: file - """ - ) - ) - - (repo_path / "foo").mkdir() - repo_example = repo_path / "foo/provider.py" - repo_example.write_text( - (Path(__file__).parents[2] / "foo_provider.py").read_text() - ) - - yield repo_path - - -@contextmanager -def setup_third_party_registry_store_repo(registry_store: str): - with tempfile.TemporaryDirectory() as repo_dir_name: - - # Construct an example repo in a temporary dir - repo_path = Path(repo_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: foo - registry: - registry_store_type: {registry_store} - path: foobar://foo.bar - provider: local - online_store: - path: data/online_store.db - type: sqlite - offline_store: - type: file - """ - ) - ) - - (repo_path / "foo").mkdir() - repo_example = repo_path / "foo/registry_store.py" - repo_example.write_text( - (Path(__file__).parents[2] / "foo_registry_store.py").read_text() - ) - - yield repo_path - - -def test_3rd_party_providers() -> None: - """ - Test running apply on third party providers - """ - runner = CliRunner() - # Check with incorrect built-in 
provider name (no dots) - with setup_third_party_provider_repo("feast123") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains(b"Provider 'feast123' is not implemented") - # Check with incorrect third-party provider name (with dots) - with setup_third_party_provider_repo("feast_foo.Provider") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains( - b"Could not import module 'feast_foo' while attempting to load class 'Provider'" - ) - # Check with incorrect third-party provider name (with dots) - with setup_third_party_provider_repo("foo.FooProvider") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains( - b"Could not import class 'FooProvider' from module 'foo'" - ) - # Check with correct third-party provider name - with setup_third_party_provider_repo("foo.provider.FooProvider") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(0) - - -def test_3rd_party_registry_store() -> None: - """ - Test running apply on third party registry stores - """ - runner = CliRunner() - # Check with incorrect built-in provider name (no dots) - with setup_third_party_registry_store_repo("feast123") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains( - b'Registry store class name should end with "RegistryStore"' - ) - # Check with incorrect third-party registry store name (with dots) - with setup_third_party_registry_store_repo("feast_foo.RegistryStore") as repo_path: - return_code, output = 
runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains( - b"Could not import module 'feast_foo' while attempting to load class 'RegistryStore'" - ) - # Check with incorrect third-party registry store name (with dots) - with setup_third_party_registry_store_repo("foo.FooRegistryStore") as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(1) - assertpy.assert_that(output).contains( - b"Could not import class 'FooRegistryStore' from module 'foo'" - ) - # Check with correct third-party registry store name - with setup_third_party_registry_store_repo( - "foo.registry_store.FooRegistryStore" - ) as repo_path: - return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assertpy.assert_that(return_code).is_equal_to(0) diff --git a/sdk/python/tests/integration/registration/test_feature_store.py b/sdk/python/tests/integration/registration/test_feature_store.py index 88a4b9f249..7b95afadba 100644 --- a/sdk/python/tests/integration/registration/test_feature_store.py +++ b/sdk/python/tests/integration/registration/test_feature_store.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import time -from datetime import datetime, timedelta +from datetime import timedelta from tempfile import mkstemp import pytest @@ -29,88 +30,13 @@ from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig from feast.repo_config import RepoConfig from feast.types import Array, Bytes, Float64, Int64, String -from tests.utils.data_source_utils import ( +from tests.utils.data_source_test_creator import ( prep_file_source, simple_bq_source_using_query_arg, simple_bq_source_using_table_arg, ) -@pytest.fixture -def feature_store_with_local_registry(): - fd, registry_path = mkstemp() - fd, online_store_path = mkstemp() - return FeatureStore( - config=RepoConfig( - registry=registry_path, - project="default", - provider="local", - online_store=SqliteOnlineStoreConfig(path=online_store_path), - ) - ) - - -@pytest.fixture -def feature_store_with_gcs_registry(): - from google.cloud import storage - - storage_client = storage.Client() - bucket_name = f"feast-registry-test-{int(time.time() * 1000)}" - bucket = storage_client.bucket(bucket_name) - bucket = storage_client.create_bucket(bucket) - bucket.add_lifecycle_delete_rule( - age=14 - ) # delete buckets automatically after 14 days - bucket.patch() - bucket.blob("registry.db") - - return FeatureStore( - config=RepoConfig( - registry=f"gs://{bucket_name}/registry.db", - project="default", - provider="gcp", - ) - ) - - -@pytest.fixture -def feature_store_with_s3_registry(): - return FeatureStore( - config=RepoConfig( - registry=f"s3://feast-integration-tests/registries/{int(time.time() * 1000)}/registry.db", - project="default", - provider="aws", - online_store=DynamoDBOnlineStoreConfig(region="us-west-2"), - offline_store=FileOfflineStoreConfig(), - ) - ) - - -@pytest.mark.parametrize( - "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], -) -def test_apply_entity_success(test_feature_store): - entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": 
"matchmaking"}, - ) - - # Register Entity - test_feature_store.apply(entity) - - entities = test_feature_store.list_entities() - - entity = entities[0] - assert ( - len(entities) == 1 - and entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - test_feature_store.teardown() - - @pytest.mark.integration @pytest.mark.parametrize( "test_feature_store", @@ -121,7 +47,9 @@ def test_apply_entity_success(test_feature_store): ) def test_apply_entity_integration(test_feature_store): entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": "matchmaking"}, + name="driver_car_id", + description="Car driver id", + tags={"team": "matchmaking"}, ) # Register Entity @@ -149,62 +77,10 @@ def test_apply_entity_integration(test_feature_store): test_feature_store.teardown() -@pytest.mark.parametrize( - "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], -) -def test_apply_feature_view_success(test_feature_store): - # Create Feature Views - batch_source = FileSource( - file_format=ParquetFormat(), - path="file://feast/*", - timestamp_field="ts_col", - created_timestamp_column="timestamp", - date_partition_column="date_partition_col", - ) - - entity = Entity(name="fs1_my_entity_1", join_keys=["entity_id"]) - - fv1 = FeatureView( - name="my_feature_view_1", - schema=[ - Field(name="fs1_my_feature_1", dtype=Int64), - Field(name="fs1_my_feature_2", dtype=String), - Field(name="fs1_my_feature_3", dtype=Array(String)), - Field(name="fs1_my_feature_4", dtype=Array(Bytes)), - Field(name="entity_id", dtype=Int64), - ], - entities=[entity], - tags={"team": "matchmaking"}, - batch_source=batch_source, - ttl=timedelta(minutes=5), - ) - - # Register Feature View - test_feature_store.apply([entity, fv1]) - - feature_views = test_feature_store.list_feature_views() - - # List Feature Views - assert ( - len(feature_views) == 1 - and feature_views[0].name 
== "my_feature_view_1" - and feature_views[0].features[0].name == "fs1_my_feature_1" - and feature_views[0].features[0].dtype == Int64 - and feature_views[0].features[1].name == "fs1_my_feature_2" - and feature_views[0].features[1].dtype == String - and feature_views[0].features[2].name == "fs1_my_feature_3" - and feature_views[0].features[2].dtype == Array(String) - and feature_views[0].features[3].name == "fs1_my_feature_4" - and feature_views[0].features[3].dtype == Array(Bytes) - and feature_views[0].entities[0] == "fs1_my_entity_1" - ) - - test_feature_store.teardown() - - @pytest.mark.integration @pytest.mark.parametrize( - "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], ) @pytest.mark.parametrize("dataframe_source", [lazy_fixture("simple_dataset_1")]) def test_feature_view_inference_success(test_feature_store, dataframe_source): @@ -344,67 +220,8 @@ def test_apply_feature_view_integration(test_feature_store): test_feature_store.teardown() -@pytest.mark.parametrize( - "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], -) -def test_apply_object_and_read(test_feature_store): - assert isinstance(test_feature_store, FeatureStore) - # Create Feature Views - batch_source = FileSource( - file_format=ParquetFormat(), - path="file://feast/*", - timestamp_field="ts_col", - created_timestamp_column="timestamp", - ) - - e1 = Entity(name="fs1_my_entity_1", description="something") - - e2 = Entity(name="fs1_my_entity_2", description="something") - - fv1 = FeatureView( - name="my_feature_view_1", - schema=[ - Field(name="fs1_my_feature_1", dtype=Int64), - Field(name="fs1_my_feature_2", dtype=String), - Field(name="fs1_my_feature_3", dtype=Array(String)), - Field(name="fs1_my_feature_4", dtype=Array(Bytes)), - Field(name="fs1_my_entity_1", dtype=Int64), - ], - entities=[e1], - tags={"team": "matchmaking"}, - batch_source=batch_source, - 
ttl=timedelta(minutes=5), - ) - - fv2 = FeatureView( - name="my_feature_view_2", - schema=[ - Field(name="fs1_my_feature_1", dtype=Int64), - Field(name="fs1_my_feature_2", dtype=String), - Field(name="fs1_my_feature_3", dtype=Array(String)), - Field(name="fs1_my_feature_4", dtype=Array(Bytes)), - Field(name="fs1_my_entity_2", dtype=Int64), - ], - entities=[e2], - tags={"team": "matchmaking"}, - batch_source=batch_source, - ttl=timedelta(minutes=5), - ) - - # Register Feature View - test_feature_store.apply([fv1, e1, fv2, e2]) - - fv1_actual = test_feature_store.get_feature_view("my_feature_view_1") - e1_actual = test_feature_store.get_entity("fs1_my_entity_1") - - assert e1 == e1_actual - assert fv2 != fv1_actual - assert e2 != e1_actual - - test_feature_store.teardown() - - -def test_apply_remote_repo(): +@pytest.fixture +def feature_store_with_local_registry(): fd, registry_path = mkstemp() fd, online_store_path = mkstemp() return FeatureStore( @@ -417,93 +234,42 @@ def test_apply_remote_repo(): ) -@pytest.mark.parametrize( - "test_feature_store", [lazy_fixture("feature_store_with_local_registry")], -) -@pytest.mark.parametrize("dataframe_source", [lazy_fixture("simple_dataset_1")]) -def test_reapply_feature_view_success(test_feature_store, dataframe_source): - with prep_file_source(df=dataframe_source, timestamp_field="ts_1") as file_source: - - e = Entity(name="id", join_keys=["id_join_key"]) - - # Create Feature View - fv1 = FeatureView( - name="my_feature_view_1", - schema=[Field(name="string_col", dtype=String)], - entities=[e], - batch_source=file_source, - ttl=timedelta(minutes=5), - ) - - # Register Feature View - test_feature_store.apply([fv1, e]) - - # Check Feature View - fv_stored = test_feature_store.get_feature_view(fv1.name) - assert len(fv_stored.materialization_intervals) == 0 - - # Run materialization - test_feature_store.materialize(datetime(2020, 1, 1), datetime(2021, 1, 1)) - - # Check Feature View - fv_stored = 
test_feature_store.get_feature_view(fv1.name) - assert len(fv_stored.materialization_intervals) == 1 - - # Apply again - test_feature_store.apply([fv1]) +@pytest.fixture +def feature_store_with_gcs_registry(): + from google.cloud import storage - # Check Feature View - fv_stored = test_feature_store.get_feature_view(fv1.name) - assert len(fv_stored.materialization_intervals) == 1 + storage_client = storage.Client() + bucket_name = f"feast-registry-test-{int(time.time() * 1000)}" + bucket = storage_client.bucket(bucket_name) + bucket = storage_client.create_bucket(bucket) + bucket.add_lifecycle_delete_rule( + age=14 + ) # delete buckets automatically after 14 days + bucket.patch() + bucket.blob("registry.db") - # Change and apply Feature View - fv1 = FeatureView( - name="my_feature_view_1", - schema=[Field(name="int64_col", dtype=Int64)], - entities=[e], - batch_source=file_source, - ttl=timedelta(minutes=5), + return FeatureStore( + config=RepoConfig( + registry=f"gs://{bucket_name}/registry.db", + project="default", + provider="gcp", ) - test_feature_store.apply([fv1]) - - # Check Feature View - fv_stored = test_feature_store.get_feature_view(fv1.name) - assert len(fv_stored.materialization_intervals) == 0 - - test_feature_store.teardown() - - -def test_apply_conflicting_featureview_names(feature_store_with_local_registry): - """Test applying feature views with non-case-insensitively unique names""" - driver = Entity(name="driver", join_keys=["driver_id"]) - customer = Entity(name="customer", join_keys=["customer_id"]) - - driver_stats = FeatureView( - name="driver_hourly_stats", - entities=[driver], - ttl=timedelta(seconds=10), - online=False, - batch_source=FileSource(path="driver_stats.parquet"), - tags={}, ) - customer_stats = FeatureView( - name="DRIVER_HOURLY_STATS", - entities=[customer], - ttl=timedelta(seconds=10), - online=False, - batch_source=FileSource(path="customer_stats.parquet"), - tags={}, + +@pytest.fixture +def feature_store_with_s3_registry(): 
+ aws_registry_path = os.getenv( + "AWS_REGISTRY_PATH", "s3://feast-integration-tests/registries" ) - try: - feature_store_with_local_registry.apply([driver_stats, customer_stats]) - error = None - except ValueError as e: - error = e - assert ( - isinstance(error, ValueError) - and "Please ensure that all feature view names are case-insensitively unique" - in error.args[0] + return FeatureStore( + config=RepoConfig( + registry=f"{aws_registry_path}/{int(time.time() * 1000)}/registry.db", + project="default", + provider="aws", + online_store=DynamoDBOnlineStoreConfig( + region=os.getenv("AWS_REGION", "us-west-2") + ), + offline_store=FileOfflineStoreConfig(), + ) ) - - feature_store_with_local_registry.teardown() diff --git a/sdk/python/tests/integration/registration/test_inference.py b/sdk/python/tests/integration/registration/test_inference.py index 935aa2d1a6..de02fe53fe 100644 --- a/sdk/python/tests/integration/registration/test_inference.py +++ b/sdk/python/tests/integration/registration/test_inference.py @@ -1,79 +1,17 @@ from copy import deepcopy -import pandas as pd import pytest -from feast import ( - BigQuerySource, - Entity, - Feature, - FeatureService, - FileSource, - RedshiftSource, - RepoConfig, - SnowflakeSource, - ValueType, -) -from feast.data_source import RequestSource -from feast.errors import ( - DataSourceNoNameException, - RegistryInferenceFailure, - SpecifiedFeaturesNotPresentError, -) -from feast.feature_view import FeatureView -from feast.field import Field -from feast.inference import ( - update_data_sources_with_inferred_event_timestamp_col, - update_feature_views_with_inferred_features_and_entities, -) -from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( - SparkSource, -) -from feast.on_demand_feature_view import on_demand_feature_view -from feast.types import Float32, Float64, Int64, String, UnixTimestamp -from tests.utils.data_source_utils import ( +from feast import RepoConfig +from feast.errors import 
RegistryInferenceFailure +from feast.inference import update_data_sources_with_inferred_event_timestamp_col +from tests.utils.data_source_test_creator import ( prep_file_source, simple_bq_source_using_query_arg, simple_bq_source_using_table_arg, ) -def test_infer_datasource_names_file(): - file_path = "path/to/test.csv" - data_source = FileSource(path=file_path) - assert data_source.name == file_path - - source_name = "my_name" - data_source = FileSource(name=source_name, path=file_path) - assert data_source.name == source_name - - -def test_infer_datasource_names_dwh(): - table = "project.table" - dwh_classes = [BigQuerySource, RedshiftSource, SnowflakeSource, SparkSource] - - for dwh_class in dwh_classes: - data_source = dwh_class(table=table) - assert data_source.name == table - - source_name = "my_name" - data_source_with_table = dwh_class(name=source_name, table=table) - assert data_source_with_table.name == source_name - data_source_with_query = dwh_class( - name=source_name, query=f"SELECT * from {table}" - ) - assert data_source_with_query.name == source_name - - # If we have a query and no name, throw an error - if dwh_class == SparkSource: - with pytest.raises(DataSourceNoNameException): - print(f"Testing dwh {dwh_class}") - data_source = dwh_class(query="test_query") - else: - data_source = dwh_class(query="test_query") - assert data_source.name == "" - - @pytest.mark.integration def test_update_file_data_source_with_inferred_event_timestamp_col(simple_dataset_1): df_with_two_viable_timestamp_cols = simple_dataset_1.copy(deep=True) @@ -113,7 +51,8 @@ def test_update_data_sources_with_inferred_event_timestamp_col(universal_data_so data_source.event_timestamp_column = None update_data_sources_with_inferred_event_timestamp_col( - data_sources_copy.values(), RepoConfig(provider="local", project="test"), + data_sources_copy.values(), + RepoConfig(provider="local", project="test"), ) actual_event_timestamp_cols = [ source.timestamp_field for source in 
data_sources_copy.values() @@ -122,322 +61,3 @@ def test_update_data_sources_with_inferred_event_timestamp_col(universal_data_so assert actual_event_timestamp_cols == ["event_timestamp"] * len( data_sources_copy.values() ) - - -def test_on_demand_features_type_inference(): - # Create Feature Views - date_request = RequestSource( - name="date_request", schema=[Field(name="some_date", dtype=UnixTimestamp)], - ) - - @on_demand_feature_view( - sources=[date_request], - schema=[ - Field(name="output", dtype=UnixTimestamp), - Field(name="string_output", dtype=String), - ], - ) - def test_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) - return data - - test_view.infer_features() - - @on_demand_feature_view( - # Note: we deliberately use `inputs` instead of `sources` to test that `inputs` - # still works correctly, even though it is deprecated. - # TODO(felixwang9817): Remove references to `inputs` once it is fully deprecated. - inputs={"date_request": date_request}, - features=[ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="object_output", dtype=ValueType.STRING), - ], - ) - def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["object_output"] = features_df["some_date"].astype(str) - return data - - with pytest.raises(ValueError, match="Value with native type object"): - invalid_test_view.infer_features() - - @on_demand_feature_view( - # Note: we deliberately use positional arguments here to test that they work correctly, - # even though positional arguments are deprecated in favor of keyword arguments. - # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. 
- [ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="missing", dtype=ValueType.STRING), - ], - {"date_request": date_request}, - ) - def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - return data - - with pytest.raises(SpecifiedFeaturesNotPresentError): - test_view_with_missing_feature.infer_features() - - -# TODO(kevjumba): remove this in feast 0.24 when deprecating -@pytest.mark.parametrize( - "request_source_schema", - [ - [Field(name="some_date", dtype=UnixTimestamp)], - {"some_date": ValueType.UNIX_TIMESTAMP}, - ], -) -def test_datasource_inference(request_source_schema): - # Create Feature Views - date_request = RequestSource(name="date_request", schema=request_source_schema,) - - @on_demand_feature_view( - # Note: we deliberately use positional arguments here to test that they work correctly, - # even though positional arguments are deprecated in favor of keyword arguments. - # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. 
- [ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="string_output", dtype=ValueType.STRING), - ], - sources=[date_request], - ) - def test_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) - return data - - test_view.infer_features() - - @on_demand_feature_view( - sources=[date_request], - schema=[ - Field(name="output", dtype=UnixTimestamp), - Field(name="object_output", dtype=String), - ], - ) - def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - data["object_output"] = features_df["some_date"].astype(str) - return data - - with pytest.raises(ValueError, match="Value with native type object"): - invalid_test_view.infer_features() - - @on_demand_feature_view( - sources=[date_request], - features=[ - Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), - Feature(name="missing", dtype=ValueType.STRING), - ], - ) - def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["output"] = features_df["some_date"] - return data - - with pytest.raises(SpecifiedFeaturesNotPresentError): - test_view_with_missing_feature.infer_features() - - -def test_feature_view_inference_respects_basic_inference(): - """ - Tests that feature view inference respects the basic inference that occurs during creation. 
- """ - file_source = FileSource(name="test", path="test path") - entity1 = Entity(name="test1", join_keys=["test_column_1"]) - entity2 = Entity(name="test2", join_keys=["test_column_2"]) - feature_view_1 = FeatureView( - name="test1", - entities=[entity1], - schema=[ - Field(name="feature", dtype=Float32), - Field(name="test_column_1", dtype=String), - ], - source=file_source, - ) - feature_view_2 = FeatureView( - name="test2", - entities=[entity1, entity2], - schema=[ - Field(name="feature", dtype=Float32), - Field(name="test_column_1", dtype=String), - Field(name="test_column_2", dtype=String), - ], - source=file_source, - ) - - assert len(feature_view_1.schema) == 2 - assert len(feature_view_1.features) == 1 - assert len(feature_view_1.entity_columns) == 1 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") - ) - assert len(feature_view_1.schema) == 2 - assert len(feature_view_1.features) == 1 - assert len(feature_view_1.entity_columns) == 1 - - assert len(feature_view_2.schema) == 3 - assert len(feature_view_2.features) == 1 - assert len(feature_view_2.entity_columns) == 2 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_2], - [entity1, entity2], - RepoConfig(provider="local", project="test"), - ) - assert len(feature_view_2.schema) == 3 - assert len(feature_view_2.features) == 1 - assert len(feature_view_2.entity_columns) == 2 - - -def test_feature_view_inference_on_entity_columns(simple_dataset_1): - """ - Tests that feature view inference correctly infers entity columns. 
- """ - with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: - entity1 = Entity(name="test1", join_keys=["id_join_key"]) - feature_view_1 = FeatureView( - name="test1", - entities=[entity1], - schema=[Field(name="int64_col", dtype=Int64)], - source=file_source, - ) - - assert len(feature_view_1.schema) == 1 - assert len(feature_view_1.features) == 1 - assert len(feature_view_1.entity_columns) == 0 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") - ) - - # The schema is only used as a parameter, as is therefore not updated during inference. - assert len(feature_view_1.schema) == 1 - - # Since there is already a feature specified, additional features are not inferred. - assert len(feature_view_1.features) == 1 - - # The single entity column is inferred correctly. - assert len(feature_view_1.entity_columns) == 1 - - -def test_feature_view_inference_respects_entity_value_type(simple_dataset_1): - """ - Tests that feature view inference still respects an entity's value type. - """ - # TODO(felixwang9817): Remove this test once entity value_type is removed. - with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: - entity1 = Entity( - name="test1", join_keys=["id_join_key"], value_type=ValueType.STRING - ) - feature_view_1 = FeatureView( - name="test1", - entities=[entity1], - schema=[Field(name="int64_col", dtype=Int64)], - source=file_source, - ) - - assert len(feature_view_1.schema) == 1 - assert len(feature_view_1.features) == 1 - assert len(feature_view_1.entity_columns) == 0 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") - ) - - # The schema is only used as a parameter, as is therefore not updated during inference. 
- assert len(feature_view_1.schema) == 1 - - # Since there is already a feature specified, additional features are not inferred. - assert len(feature_view_1.features) == 1 - - # The single entity column is inferred correctly and has type String. - assert len(feature_view_1.entity_columns) == 1 - assert feature_view_1.entity_columns[0].dtype == String - - -def test_feature_view_inference_on_feature_columns(simple_dataset_1): - """ - Tests that feature view inference correctly infers feature columns. - """ - with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: - entity1 = Entity(name="test1", join_keys=["id_join_key"]) - feature_view_1 = FeatureView( - name="test1", - entities=[entity1], - schema=[Field(name="id_join_key", dtype=Int64)], - source=file_source, - ) - - assert len(feature_view_1.schema) == 1 - assert len(feature_view_1.features) == 0 - assert len(feature_view_1.entity_columns) == 1 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1], [entity1], RepoConfig(provider="local", project="test") - ) - - # The schema is only used as a parameter, as is therefore not updated during inference. - assert len(feature_view_1.schema) == 1 - - # All three feature columns are inferred correctly. - assert len(feature_view_1.features) == 3 - print(feature_view_1.features) - feature_column_1 = Field(name="float_col", dtype=Float64) - feature_column_2 = Field(name="int64_col", dtype=Int64) - feature_column_3 = Field(name="string_col", dtype=String) - assert feature_column_1 in feature_view_1.features - assert feature_column_2 in feature_view_1.features - assert feature_column_3 in feature_view_1.features - - # The single entity column remains. 
- assert len(feature_view_1.entity_columns) == 1 - - -def test_update_feature_services_with_inferred_features(simple_dataset_1): - with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: - entity1 = Entity(name="test1", join_keys=["id_join_key"]) - feature_view_1 = FeatureView( - name="test1", entities=[entity1], source=file_source, - ) - feature_view_2 = FeatureView( - name="test2", entities=[entity1], source=file_source, - ) - - feature_service = FeatureService( - name="fs_1", features=[feature_view_1[["string_col"]], feature_view_2] - ) - assert len(feature_service.feature_view_projections) == 2 - assert len(feature_service.feature_view_projections[0].features) == 0 - assert len(feature_service.feature_view_projections[0].desired_features) == 1 - assert len(feature_service.feature_view_projections[1].features) == 0 - assert len(feature_service.feature_view_projections[1].desired_features) == 0 - - update_feature_views_with_inferred_features_and_entities( - [feature_view_1, feature_view_2], - [entity1], - RepoConfig(provider="local", project="test"), - ) - feature_service.infer_features( - fvs_to_update={ - feature_view_1.name: feature_view_1, - feature_view_2.name: feature_view_2, - } - ) - - assert len(feature_view_1.schema) == 0 - assert len(feature_view_1.features) == 3 - assert len(feature_view_2.schema) == 0 - assert len(feature_view_2.features) == 3 - assert len(feature_service.feature_view_projections[0].features) == 1 - assert len(feature_service.feature_view_projections[1].features) == 3 - - -# TODO(felixwang9817): Add tests that interact with field mapping. 
diff --git a/sdk/python/tests/integration/registration/test_registry.py b/sdk/python/tests/integration/registration/test_registry.py index 36e19e222a..0cc161d997 100644 --- a/sdk/python/tests/integration/registration/test_registry.py +++ b/sdk/python/tests/integration/registration/test_registry.py @@ -11,35 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import time from datetime import timedelta -from tempfile import mkstemp -import pandas as pd import pytest from pytest_lazyfixture import lazy_fixture from feast import FileSource -from feast.aggregation import Aggregation -from feast.data_format import AvroFormat, ParquetFormat -from feast.data_source import KafkaSource +from feast.data_format import ParquetFormat from feast.entity import Entity -from feast.feature import Feature from feast.feature_view import FeatureView from feast.field import Field -from feast.on_demand_feature_view import RequestSource, on_demand_feature_view from feast.registry import Registry from feast.repo_config import RegistryConfig -from feast.stream_feature_view import StreamFeatureView -from feast.types import Array, Bytes, Float32, Int32, Int64, String -from feast.value_type import ValueType - - -@pytest.fixture -def local_registry() -> Registry: - fd, registry_path = mkstemp() - registry_config = RegistryConfig(path=registry_path, cache_ttl_seconds=600) - return Registry(registry_config, None) +from feast.types import Array, Bytes, Int64, String +from tests.utils.e2e_test_validation import validate_registry_data_source_apply @pytest.fixture @@ -63,63 +50,26 @@ def gcs_registry() -> Registry: @pytest.fixture def s3_registry() -> Registry: + aws_registry_path = os.getenv( + "AWS_REGISTRY_PATH", "s3://feast-integration-tests/registries" + ) registry_config = RegistryConfig( - 
path=f"s3://feast-integration-tests/registries/{int(time.time() * 1000)}/registry.db", + path=f"{aws_registry_path}/{int(time.time() * 1000)}/registry.db", cache_ttl_seconds=600, ) return Registry(registry_config, None) -@pytest.mark.parametrize( - "test_registry", [lazy_fixture("local_registry")], -) -def test_apply_entity_success(test_registry): - entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": "matchmaking"}, - ) - - project = "project" - - # Register Entity - test_registry.apply_entity(entity, project) - - entities = test_registry.list_entities(project) - - entity = entities[0] - assert ( - len(entities) == 1 - and entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - entity = test_registry.get_entity("driver_car_id", project) - assert ( - entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - test_registry.delete_entity("driver_car_id", project) - entities = test_registry.list_entities(project) - assert len(entities) == 0 - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - @pytest.mark.integration @pytest.mark.parametrize( - "test_registry", [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], + "test_registry", + [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], ) def test_apply_entity_integration(test_registry): entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": "matchmaking"}, + name="driver_car_id", + description="Car driver id", + tags={"team": "matchmaking"}, ) project = "project" @@ -150,350 +100,13 @@ def test_apply_entity_integration(test_registry): # Will try to reload registry, which will fail because the file has been 
deleted with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - -@pytest.mark.parametrize( - "test_registry", [lazy_fixture("local_registry")], -) -def test_apply_feature_view_success(test_registry): - # Create Feature Views - batch_source = FileSource( - file_format=ParquetFormat(), - path="file://feast/*", - timestamp_field="ts_col", - created_timestamp_column="timestamp", - ) - - entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) - - fv1 = FeatureView( - name="my_feature_view_1", - schema=[ - Field(name="fs1_my_feature_1", dtype=Int64), - Field(name="fs1_my_feature_2", dtype=String), - Field(name="fs1_my_feature_3", dtype=Array(String)), - Field(name="fs1_my_feature_4", dtype=Array(Bytes)), - ], - entities=[entity], - tags={"team": "matchmaking"}, - batch_source=batch_source, - ttl=timedelta(minutes=5), - ) - - project = "project" - - # Register Feature View - test_registry.apply_feature_view(fv1, project) - - feature_views = test_registry.list_feature_views(project) - - # List Feature Views - assert ( - len(feature_views) == 1 - and feature_views[0].name == "my_feature_view_1" - and feature_views[0].features[0].name == "fs1_my_feature_1" - and feature_views[0].features[0].dtype == Int64 - and feature_views[0].features[1].name == "fs1_my_feature_2" - and feature_views[0].features[1].dtype == String - and feature_views[0].features[2].name == "fs1_my_feature_3" - and feature_views[0].features[2].dtype == Array(String) - and feature_views[0].features[3].name == "fs1_my_feature_4" - and feature_views[0].features[3].dtype == Array(Bytes) - and feature_views[0].entities[0] == "fs1_my_entity_1" - ) - - feature_view = test_registry.get_feature_view("my_feature_view_1", project) - assert ( - feature_view.name == "my_feature_view_1" - and feature_view.features[0].name == "fs1_my_feature_1" - and feature_view.features[0].dtype == Int64 - and feature_view.features[1].name == "fs1_my_feature_2" - and feature_view.features[1].dtype == String 
- and feature_view.features[2].name == "fs1_my_feature_3" - and feature_view.features[2].dtype == Array(String) - and feature_view.features[3].name == "fs1_my_feature_4" - and feature_view.features[3].dtype == Array(Bytes) - and feature_view.entities[0] == "fs1_my_entity_1" - ) - - test_registry.delete_feature_view("my_feature_view_1", project) - feature_views = test_registry.list_feature_views(project) - assert len(feature_views) == 0 - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - -@pytest.mark.parametrize( - "test_registry", [lazy_fixture("local_registry")], -) -def test_apply_on_demand_feature_view_success(test_registry): - # Create Feature Views - driver_stats = FileSource( - name="driver_stats_source", - path="data/driver_stats_lat_lon.parquet", - timestamp_field="event_timestamp", - created_timestamp_column="created", - description="A table describing the stats of a driver based on hourly logs", - owner="test2@gmail.com", - ) - - driver_daily_features_view = FeatureView( - name="driver_daily_features", - entities=["driver"], - ttl=timedelta(seconds=8640000000), - schema=[ - Field(name="daily_miles_driven", dtype=Float32), - Field(name="lat", dtype=Float32), - Field(name="lon", dtype=Float32), - Field(name="string_feature", dtype=String), - ], - online=True, - source=driver_stats, - tags={"production": "True"}, - owner="test2@gmail.com", - ) - - @on_demand_feature_view( - sources=[driver_daily_features_view], - schema=[Field(name="first_char", dtype=String)], - ) - def location_features_from_push(inputs: pd.DataFrame) -> pd.DataFrame: - df = pd.DataFrame() - df["first_char"] = inputs["string_feature"].str[:1].astype("string") - return df - - project = "project" - - # Register Feature View - test_registry.apply_feature_view(location_features_from_push, project) - - feature_views = 
test_registry.list_on_demand_feature_views(project) - - # List Feature Views - assert ( - len(feature_views) == 1 - and feature_views[0].name == "location_features_from_push" - and feature_views[0].features[0].name == "first_char" - and feature_views[0].features[0].dtype == String - ) - - feature_view = test_registry.get_on_demand_feature_view( - "location_features_from_push", project - ) - assert ( - feature_view.name == "location_features_from_push" - and feature_view.features[0].name == "first_char" - and feature_view.features[0].dtype == String - ) - - test_registry.delete_feature_view("location_features_from_push", project) - feature_views = test_registry.list_on_demand_feature_views(project) - assert len(feature_views) == 0 - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - -@pytest.mark.parametrize( - "test_registry", [lazy_fixture("local_registry")], -) -def test_apply_stream_feature_view_success(test_registry): - # Create Feature Views - def simple_udf(x: int): - return x + 3 - - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - kafka_bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=FileSource(path="some path"), - watermark_delay_threshold=timedelta(days=1), - ) - - sfv = StreamFeatureView( - name="test kafka stream feature view", - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", function="count", time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - 
udf=simple_udf, - tags={}, - ) - - project = "project" - - # Register Feature View - test_registry.apply_feature_view(sfv, project) - - stream_feature_views = test_registry.list_stream_feature_views(project) - - # List Feature Views - assert len(stream_feature_views) == 1 - assert stream_feature_views[0] == sfv - - test_registry.delete_feature_view("test kafka stream feature view", project) - stream_feature_views = test_registry.list_stream_feature_views(project) - assert len(stream_feature_views) == 0 - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - -@pytest.mark.parametrize( - "test_registry", [lazy_fixture("local_registry")], -) -# TODO(kevjumba): remove this in feast 0.24 when deprecating -@pytest.mark.parametrize( - "request_source_schema", - [[Field(name="my_input_1", dtype=Int32)], {"my_input_1": ValueType.INT32}], -) -def test_modify_feature_views_success(test_registry, request_source_schema): - # Create Feature Views - batch_source = FileSource( - file_format=ParquetFormat(), - path="file://feast/*", - timestamp_field="ts_col", - created_timestamp_column="timestamp", - ) - - request_source = RequestSource(name="request_source", schema=request_source_schema,) - - entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) - - fv1 = FeatureView( - name="my_feature_view_1", - schema=[Field(name="fs1_my_feature_1", dtype=Int64)], - entities=[entity], - tags={"team": "matchmaking"}, - batch_source=batch_source, - ttl=timedelta(minutes=5), - ) - - @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), - ], - sources=[request_source], - ) - def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("category") - 
data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32") - return data - - project = "project" - - # Register Feature Views - test_registry.apply_feature_view(odfv1, project) - test_registry.apply_feature_view(fv1, project) - - # Modify odfv by changing a single feature dtype - @on_demand_feature_view( - features=[ - Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT), - Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), - ], - sources=[request_source], - ) - def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: - data = pd.DataFrame() - data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float") - data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32") - return data - - # Apply the modified odfv - test_registry.apply_feature_view(odfv1, project) - - # Check odfv - on_demand_feature_views = test_registry.list_on_demand_feature_views(project) - - assert ( - len(on_demand_feature_views) == 1 - and on_demand_feature_views[0].name == "odfv1" - and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1" - and on_demand_feature_views[0].features[0].dtype == Float32 - and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2" - and on_demand_feature_views[0].features[1].dtype == Int32 - ) - request_schema = on_demand_feature_views[0].get_request_data_schema() - assert ( - list(request_schema.keys())[0] == "my_input_1" - and list(request_schema.values())[0] == ValueType.INT32 - ) - - feature_view = test_registry.get_on_demand_feature_view("odfv1", project) - assert ( - feature_view.name == "odfv1" - and feature_view.features[0].name == "odfv1_my_feature_1" - and feature_view.features[0].dtype == Float32 - and feature_view.features[1].name == "odfv1_my_feature_2" - and feature_view.features[1].dtype == Int32 - ) - request_schema = feature_view.get_request_data_schema() - assert ( - list(request_schema.keys())[0] == "my_input_1" - and list(request_schema.values())[0] == ValueType.INT32 - ) - - # Make 
sure fv1 is untouched - feature_views = test_registry.list_feature_views(project) - - # List Feature Views - assert ( - len(feature_views) == 1 - and feature_views[0].name == "my_feature_view_1" - and feature_views[0].features[0].name == "fs1_my_feature_1" - and feature_views[0].features[0].dtype == Int64 - and feature_views[0].entities[0] == "fs1_my_entity_1" - ) - - feature_view = test_registry.get_feature_view("my_feature_view_1", project) - assert ( - feature_view.name == "my_feature_view_1" - and feature_view.features[0].name == "fs1_my_feature_1" - and feature_view.features[0].dtype == Int64 - and feature_view.entities[0] == "fs1_my_entity_1" - ) - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() + test_registry._get_registry_proto(project=project) @pytest.mark.integration @pytest.mark.parametrize( - "test_registry", [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], + "test_registry", + [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], ) def test_apply_feature_view_integration(test_registry): # Create Feature Views @@ -564,143 +177,13 @@ def test_apply_feature_view_integration(test_registry): # Will try to reload registry, which will fail because the file has been deleted with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() + test_registry._get_registry_proto(project=project) @pytest.mark.integration @pytest.mark.parametrize( - "test_registry", [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], + "test_registry", + [lazy_fixture("gcs_registry"), lazy_fixture("s3_registry")], ) -def test_apply_data_source(test_registry: Registry): - # Create Feature Views - batch_source = FileSource( - name="test_source", - file_format=ParquetFormat(), - path="file://feast/*", - timestamp_field="ts_col", - created_timestamp_column="timestamp", - ) - - entity = 
Entity(name="fs1_my_entity_1", join_keys=["test"]) - - fv1 = FeatureView( - name="my_feature_view_1", - schema=[ - Field(name="fs1_my_feature_1", dtype=Int64), - Field(name="fs1_my_feature_2", dtype=String), - Field(name="fs1_my_feature_3", dtype=Array(String)), - Field(name="fs1_my_feature_4", dtype=Array(Bytes)), - ], - entities=[entity], - tags={"team": "matchmaking"}, - batch_source=batch_source, - ttl=timedelta(minutes=5), - ) - - project = "project" - - # Register data source and feature view - test_registry.apply_data_source(batch_source, project, commit=False) - test_registry.apply_feature_view(fv1, project, commit=True) - - registry_feature_views = test_registry.list_feature_views(project) - registry_data_sources = test_registry.list_data_sources(project) - assert len(registry_feature_views) == 1 - assert len(registry_data_sources) == 1 - registry_feature_view = registry_feature_views[0] - assert registry_feature_view.batch_source == batch_source - registry_data_source = registry_data_sources[0] - assert registry_data_source == batch_source - - # Check that change to batch source propagates - batch_source.timestamp_field = "new_ts_col" - test_registry.apply_data_source(batch_source, project, commit=False) - test_registry.apply_feature_view(fv1, project, commit=True) - registry_feature_views = test_registry.list_feature_views(project) - registry_data_sources = test_registry.list_data_sources(project) - assert len(registry_feature_views) == 1 - assert len(registry_data_sources) == 1 - registry_feature_view = registry_feature_views[0] - assert registry_feature_view.batch_source == batch_source - registry_batch_source = test_registry.list_data_sources(project)[0] - assert registry_batch_source == batch_source - - test_registry.teardown() - - # Will try to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() - - -def test_commit(): - fd, registry_path = mkstemp() - 
registry_config = RegistryConfig(path=registry_path, cache_ttl_seconds=600) - test_registry = Registry(registry_config, None) - - entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": "matchmaking"}, - ) - - project = "project" - - # Register Entity without commiting - test_registry.apply_entity(entity, project, commit=False) - - # Retrieving the entity should still succeed - entities = test_registry.list_entities(project, allow_cache=True) - - entity = entities[0] - assert ( - len(entities) == 1 - and entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - entity = test_registry.get_entity("driver_car_id", project, allow_cache=True) - assert ( - entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - # Create new registry that points to the same store - registry_with_same_store = Registry(registry_config, None) - - # Retrieving the entity should fail since the store is empty - entities = registry_with_same_store.list_entities(project) - assert len(entities) == 0 - - # commit from the original registry - test_registry.commit() - - # Reconstruct the new registry in order to read the newly written store - registry_with_same_store = Registry(registry_config, None) - - # Retrieving the entity should now succeed - entities = registry_with_same_store.list_entities(project) - - entity = entities[0] - assert ( - len(entities) == 1 - and entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - entity = test_registry.get_entity("driver_car_id", project) - assert ( - entity.name == "driver_car_id" - and entity.description == "Car driver id" - and "team" in entity.tags - and entity.tags["team"] == "matchmaking" - ) - - test_registry.teardown() - - # Will try 
to reload registry, which will fail because the file has been deleted - with pytest.raises(FileNotFoundError): - test_registry._get_registry_proto() +def test_apply_data_source_integration(test_registry: Registry): + validate_registry_data_source_apply(test_registry) diff --git a/sdk/python/tests/integration/registration/test_sql_registry.py b/sdk/python/tests/integration/registration/test_sql_registry.py index c483a7c46f..286b1abd21 100644 --- a/sdk/python/tests/integration/registration/test_sql_registry.py +++ b/sdk/python/tests/integration/registration/test_sql_registry.py @@ -56,7 +56,10 @@ def pg_registry(): log_string_to_wait_for = "database system is ready to accept connections" waited = wait_for_logs( - container=container, predicate=log_string_to_wait_for, timeout=30, interval=10, + container=container, + predicate=log_string_to_wait_for, + timeout=30, + interval=10, ) logger.info("Waited for %s seconds until postgres container was up", waited) container_port = container.get_exposed_port(5432) @@ -84,9 +87,13 @@ def mysql_registry(): container.start() - log_string_to_wait_for = "/usr/sbin/mysqld: ready for connections. Version: '8.0.29' socket: '/var/run/mysqld/mysqld.sock' port: 3306" + # The log string uses '8.0.*' since the version might be changed as new Docker images are pushed. + log_string_to_wait_for = "/usr/sbin/mysqld: ready for connections. 
Version: '8.0.*' socket: '/var/run/mysqld/mysqld.sock' port: 3306" waited = wait_for_logs( - container=container, predicate=log_string_to_wait_for, timeout=60, interval=10, + container=container, + predicate=log_string_to_wait_for, + timeout=60, + interval=10, ) logger.info("Waited for %s seconds until mysql container was up", waited) container_port = container.get_exposed_port(3306) @@ -106,19 +113,28 @@ def mysql_registry(): reason="does not run on mac github actions", ) @pytest.mark.parametrize( - "sql_registry", [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + "sql_registry", + [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], ) def test_apply_entity_success(sql_registry): entity = Entity( - name="driver_car_id", description="Car driver id", tags={"team": "matchmaking"}, + name="driver_car_id", + description="Car driver id", + tags={"team": "matchmaking"}, ) project = "project" # Register Entity sql_registry.apply_entity(entity, project) + project_metadata = sql_registry.list_project_metadata(project=project) + assert len(project_metadata) == 1 + project_uuid = project_metadata[0].project_uuid + assert len(project_metadata[0].project_uuid) == 36 + assert_project_uuid(project, project_uuid, sql_registry) entities = sql_registry.list_entities(project) + assert_project_uuid(project, project_uuid, sql_registry) entity = entities[0] assert ( @@ -138,18 +154,27 @@ def test_apply_entity_success(sql_registry): ) sql_registry.delete_entity("driver_car_id", project) + assert_project_uuid(project, project_uuid, sql_registry) entities = sql_registry.list_entities(project) + assert_project_uuid(project, project_uuid, sql_registry) assert len(entities) == 0 sql_registry.teardown() +def assert_project_uuid(project, project_uuid, sql_registry): + project_metadata = sql_registry.list_project_metadata(project=project) + assert len(project_metadata) == 1 + assert project_metadata[0].project_uuid == project_uuid + + @pytest.mark.skipif( sys.platform == 
"darwin" and "GITHUB_REF" in os.environ, reason="does not run on mac github actions", ) @pytest.mark.parametrize( - "sql_registry", [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + "sql_registry", + [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], ) def test_apply_feature_view_success(sql_registry): # Create Feature Views @@ -224,7 +249,8 @@ def test_apply_feature_view_success(sql_registry): reason="does not run on mac github actions", ) @pytest.mark.parametrize( - "sql_registry", [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + "sql_registry", + [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], ) def test_apply_on_demand_feature_view_success(sql_registry): # Create Feature Views @@ -307,7 +333,8 @@ def location_features_from_push(inputs: pd.DataFrame) -> pd.DataFrame: reason="does not run on mac github actions", ) @pytest.mark.parametrize( - "sql_registry", [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + "sql_registry", + [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], ) @pytest.mark.parametrize( "request_source_schema", @@ -322,7 +349,10 @@ def test_modify_feature_views_success(sql_registry, request_source_schema): created_timestamp_column="timestamp", ) - request_source = RequestSource(name="request_source", schema=request_source_schema,) + request_source = RequestSource( + name="request_source", + schema=request_source_schema, + ) entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) @@ -431,7 +461,8 @@ def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: ) @pytest.mark.integration @pytest.mark.parametrize( - "sql_registry", [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], + "sql_registry", + [lazy_fixture("mysql_registry"), lazy_fixture("pg_registry")], ) def test_apply_data_source(sql_registry): # Create Feature Views diff --git a/sdk/python/tests/integration/registration/test_universal_cli.py 
b/sdk/python/tests/integration/registration/test_universal_cli.py new file mode 100644 index 0000000000..1fb82ce59f --- /dev/null +++ b/sdk/python/tests/integration/registration/test_universal_cli.py @@ -0,0 +1,166 @@ +import tempfile +import uuid +from pathlib import Path +from textwrap import dedent + +import pytest +from assertpy import assertpy + +from feast.feature_store import FeatureStore +from tests.integration.feature_repos.repo_configuration import Environment +from tests.utils.basic_read_write_test import basic_rw_test +from tests.utils.cli_repo_creator import CliRunner, get_example_repo +from tests.utils.e2e_test_validation import ( + NULLABLE_ONLINE_STORE_CONFIGS, + make_feature_store_yaml, +) + + +@pytest.mark.integration +@pytest.mark.universal_offline_stores +def test_universal_cli(environment: Environment): + project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}" + runner = CliRunner() + + with tempfile.TemporaryDirectory() as repo_dir_name: + try: + repo_path = Path(repo_dir_name) + feature_store_yaml = make_feature_store_yaml( + project, environment.test_repo_config, repo_path + ) + + repo_config = repo_path / "feature_store.yaml" + + repo_config.write_text(dedent(feature_store_yaml)) + + repo_example = repo_path / "example.py" + repo_example.write_text(get_example_repo("example_feature_repo_1.py")) + result = runner.run(["apply"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + + # Store registry contents, to be compared later. + fs = FeatureStore(repo_path=str(repo_path)) + registry_dict = fs.registry.to_dict(project=project) + # Save only the specs, not the metadata. 
+ registry_specs = { + key: [fco["spec"] if "spec" in fco else fco for fco in value] + for key, value in registry_dict.items() + } + + # entity & feature view list commands should succeed + result = runner.run(["entities", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run(["feature-views", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run(["feature-services", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run(["data-sources", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + + # entity & feature view describe commands should succeed when objects exist + result = runner.run(["entities", "describe", "driver"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run( + ["feature-views", "describe", "driver_locations"], cwd=repo_path + ) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run( + ["feature-services", "describe", "driver_locations_service"], + cwd=repo_path, + ) + assertpy.assert_that(result.returncode).is_equal_to(0) + assertpy.assert_that(fs.list_feature_views()).is_length(4) + result = runner.run( + ["data-sources", "describe", "customer_profile_source"], + cwd=repo_path, + ) + assertpy.assert_that(result.returncode).is_equal_to(0) + assertpy.assert_that(fs.list_data_sources()).is_length(4) + + # entity & feature view describe commands should fail when objects don't exist + result = runner.run(["entities", "describe", "foo"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(1) + result = runner.run(["feature-views", "describe", "foo"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(1) + result = runner.run(["feature-services", "describe", "foo"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(1) + result = runner.run(["data-sources", 
"describe", "foo"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(1) + + # Doing another apply should be a no op, and should not cause errors + result = runner.run(["apply"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + basic_rw_test( + FeatureStore(repo_path=str(repo_path), config=None), + view_name="driver_locations", + ) + + # Confirm that registry contents have not changed. + registry_dict = fs.registry.to_dict(project=project) + assertpy.assert_that(registry_specs).is_equal_to( + { + key: [fco["spec"] if "spec" in fco else fco for fco in value] + for key, value in registry_dict.items() + } + ) + + result = runner.run(["teardown"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + finally: + runner.run(["teardown"], cwd=repo_path) + + +@pytest.mark.integration +@pytest.mark.universal_offline_stores +def test_odfv_apply(environment) -> None: + project = f"test_odfv_apply{str(uuid.uuid4()).replace('-', '')[:8]}" + runner = CliRunner() + + with tempfile.TemporaryDirectory() as repo_dir_name: + try: + repo_path = Path(repo_dir_name) + feature_store_yaml = make_feature_store_yaml( + project, environment.test_repo_config, repo_path + ) + + repo_config = repo_path / "feature_store.yaml" + + repo_config.write_text(dedent(feature_store_yaml)) + + repo_example = repo_path / "example.py" + repo_example.write_text(get_example_repo("on_demand_feature_view_repo.py")) + result = runner.run(["apply"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + + # entity & feature view list commands should succeed + result = runner.run(["entities", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + result = runner.run(["on-demand-feature-views", "list"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + finally: + runner.run(["teardown"], cwd=repo_path) + + +@pytest.mark.integration 
+@pytest.mark.parametrize("test_nullable_online_store", NULLABLE_ONLINE_STORE_CONFIGS) +def test_nullable_online_store(test_nullable_online_store) -> None: + project = f"test_nullable_online_store{str(uuid.uuid4()).replace('-', '')[:8]}" + runner = CliRunner() + + with tempfile.TemporaryDirectory() as repo_dir_name: + try: + repo_path = Path(repo_dir_name) + feature_store_yaml = make_feature_store_yaml( + project, test_nullable_online_store, repo_path + ) + + repo_config = repo_path / "feature_store.yaml" + + repo_config.write_text(dedent(feature_store_yaml)) + + repo_example = repo_path / "example.py" + repo_example.write_text(get_example_repo("empty_feature_repo.py")) + result = runner.run(["apply"], cwd=repo_path) + assertpy.assert_that(result.returncode).is_equal_to(0) + finally: + runner.run(["teardown"], cwd=repo_path) diff --git a/sdk/python/tests/integration/registration/test_universal_odfv_feature_inference.py b/sdk/python/tests/integration/registration/test_universal_odfv_feature_inference.py index b7a9a571af..ce960b9c35 100644 --- a/sdk/python/tests/integration/registration/test_universal_odfv_feature_inference.py +++ b/sdk/python/tests/integration/registration/test_universal_odfv_feature_inference.py @@ -31,7 +31,8 @@ def test_infer_odfv_features(environment, universal_data_sources, infer_features ) request_source = create_conv_rate_request_source() driver_odfv = conv_rate_plus_100_feature_view( - [driver_hourly_stats, request_source], infer_features=infer_features, + [driver_hourly_stats, request_source], + infer_features=infer_features, ) feast_objects = [driver_hourly_stats, driver_odfv, driver(), customer()] @@ -83,7 +84,8 @@ def test_infer_odfv_features_with_error(environment, universal_data_sources): ) request_source = create_conv_rate_request_source() driver_odfv = conv_rate_plus_100_feature_view( - [driver_hourly_stats, request_source], features=features, + [driver_hourly_stats, request_source], + features=features, ) feast_objects = 
[driver_hourly_stats, driver_odfv, driver(), customer()] diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index b03303f6ee..1d90eee13e 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -19,99 +19,13 @@ String, UnixTimestamp, ) -from tests.data.data_creator import create_dataset +from tests.data.data_creator import create_basic_driver_dataset from tests.integration.feature_repos.universal.entities import driver from tests.integration.feature_repos.universal.feature_views import driver_feature_view logger = logging.getLogger(__name__) -def populate_test_configs(offline: bool): - feature_dtypes = [ - "int32", - "int64", - "float", - "bool", - "datetime", - ] - configs: List[TypeTestConfig] = [] - for feature_dtype in feature_dtypes: - for feature_is_list in [True, False]: - for has_empty_list in [True, False]: - # For non list features `has_empty_list` does nothing - if feature_is_list is False and has_empty_list is True: - continue - - configs.append( - TypeTestConfig( - feature_dtype=feature_dtype, - feature_is_list=feature_is_list, - has_empty_list=has_empty_list, - ) - ) - return configs - - -@dataclass(frozen=True, repr=True) -class TypeTestConfig: - feature_dtype: str - feature_is_list: bool - has_empty_list: bool - - -OFFLINE_TYPE_TEST_CONFIGS: List[TypeTestConfig] = populate_test_configs(offline=True) -ONLINE_TYPE_TEST_CONFIGS: List[TypeTestConfig] = populate_test_configs(offline=False) - - -@pytest.fixture( - params=OFFLINE_TYPE_TEST_CONFIGS, - scope="session", - ids=[str(c) for c in OFFLINE_TYPE_TEST_CONFIGS], -) -def offline_types_test_fixtures(request, environment): - config: TypeTestConfig = request.param - if ( - environment.test_repo_config.provider == "aws" - and config.feature_is_list is True - ): - pytest.skip("Redshift doesn't support list features") 
- - return get_fixtures(request, environment) - - -@pytest.fixture( - params=ONLINE_TYPE_TEST_CONFIGS, - scope="session", - ids=[str(c) for c in ONLINE_TYPE_TEST_CONFIGS], -) -def online_types_test_fixtures(request, environment): - return get_fixtures(request, environment) - - -def get_fixtures(request, environment): - config: TypeTestConfig = request.param - # Lower case needed because Redshift lower-cases all table names - destination_name = f"feature_type_{config.feature_dtype}{config.feature_is_list}".replace( - ".", "" - ).lower() - config = request.param - df = create_dataset( - Int64, config.feature_dtype, config.feature_is_list, config.has_empty_list, - ) - data_source = environment.data_source_creator.create_data_source( - df, destination_name=destination_name, field_mapping={"ts_1": "ts"}, - ) - fv = create_feature_view( - destination_name, - config.feature_dtype, - config.feature_is_list, - config.has_empty_list, - data_source, - ) - - return config, data_source, fv - - @pytest.mark.integration @pytest.mark.universal_offline_stores @pytest.mark.parametrize("entity_type", [Int32, Int64, String]) @@ -119,7 +33,10 @@ def test_entity_inference_types_match(environment, entity_type): fs = environment.feature_store # Don't specify value type in entity to force inference - df = create_dataset(entity_type, feature_dtype="int32",) + df = create_basic_driver_dataset( + entity_type, + feature_dtype="int32", + ) data_source = environment.data_source_creator.create_data_source( df, destination_name=f"entity_type_{entity_type.name.lower()}", @@ -190,7 +107,8 @@ def test_feature_get_historical_features_types_match( features = [f"{fv.name}:value"] historical_features = fs.get_historical_features( - entity_df=entity_df, features=features, + entity_df=entity_df, + features=features, ) # Note: Pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() @@ -239,7 +157,8 @@ def test_feature_get_online_features_types_match( ) online_features = fs.get_online_features( - features=features, entity_rows=[{"driver_id": 1}], + features=features, + entity_rows=[{"driver_id": 1}], ).to_dict() feature_list_dtype_to_expected_online_response_value_type = { @@ -344,7 +263,10 @@ def assert_feature_list_types( bool, np.bool_, ), # Can be `np.bool_` if from `np.array` rather that `list` - "datetime": (np.datetime64, datetime,), # datetime.datetime + "datetime": ( + np.datetime64, + datetime, + ), # datetime.datetime } expected_dtype = feature_list_dtype_to_expected_historical_feature_list_dtype[ feature_dtype @@ -384,3 +306,94 @@ def assert_expected_arrow_types( assert arrow_type_checker(pa_type.value_type) else: assert arrow_type_checker(pa_type) + + +def populate_test_configs(offline: bool): + feature_dtypes = [ + "int32", + "int64", + "float", + "bool", + "datetime", + ] + configs: List[TypeTestConfig] = [] + for feature_dtype in feature_dtypes: + for feature_is_list in [True, False]: + for has_empty_list in [True, False]: + # For non list features `has_empty_list` does nothing + if feature_is_list is False and has_empty_list is True: + continue + + configs.append( + TypeTestConfig( + feature_dtype=feature_dtype, + feature_is_list=feature_is_list, + has_empty_list=has_empty_list, + ) + ) + return configs + + +@dataclass(frozen=True, repr=True) +class TypeTestConfig: + feature_dtype: str + feature_is_list: bool + has_empty_list: bool + + +OFFLINE_TYPE_TEST_CONFIGS: List[TypeTestConfig] = populate_test_configs(offline=True) +ONLINE_TYPE_TEST_CONFIGS: List[TypeTestConfig] = populate_test_configs(offline=False) + + +@pytest.fixture( + params=OFFLINE_TYPE_TEST_CONFIGS, + ids=[str(c) for c in OFFLINE_TYPE_TEST_CONFIGS], +) +def offline_types_test_fixtures(request, environment): + config: TypeTestConfig = request.param + if ( + 
environment.test_repo_config.provider == "aws" + and config.feature_is_list is True + ): + pytest.skip("Redshift doesn't support list features") + + return get_fixtures(request, environment) + + +@pytest.fixture( + params=ONLINE_TYPE_TEST_CONFIGS, + ids=[str(c) for c in ONLINE_TYPE_TEST_CONFIGS], +) +def online_types_test_fixtures(request, environment): + return get_fixtures(request, environment) + + +def get_fixtures(request, environment): + config: TypeTestConfig = request.param + # Lower case needed because Redshift lower-cases all table names + destination_name = ( + f"feature_type_{config.feature_dtype}{config.feature_is_list}".replace( + ".", "" + ).lower() + ) + config = request.param + df = create_basic_driver_dataset( + Int64, + config.feature_dtype, + config.feature_is_list, + config.has_empty_list, + ) + data_source = environment.data_source_creator.create_data_source( + df, + destination_name=destination_name, + field_mapping={"ts_1": "ts"}, + ) + fv = create_feature_view( + destination_name, + config.feature_dtype, + config.feature_is_list, + config.has_empty_list, + data_source, + ) + + return config, data_source, fv diff --git a/sdk/python/tests/integration/scaffolding/test_partial_apply.py b/sdk/python/tests/integration/scaffolding/test_partial_apply.py deleted file mode 100644 index e5a7206b96..0000000000 --- a/sdk/python/tests/integration/scaffolding/test_partial_apply.py +++ /dev/null @@ -1,48 +0,0 @@ -from datetime import timedelta - -import pytest - -from feast import BigQuerySource, Entity, FeatureView, Field -from feast.types import Float32, String -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - - -@pytest.mark.integration -def test_partial() -> None: - """ - Add another table to existing repo using partial apply API. Make sure both the table - applied via CLI apply and the new table are passing RW test. 
- """ - - runner = CliRunner() - with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" - ) as store: - driver = Entity(name="driver", join_keys=["test"]) - - driver_locations_source = BigQuerySource( - table="feast-oss.public.drivers", - timestamp_field="event_timestamp", - created_timestamp_column="created_timestamp", - ) - - driver_locations_100 = FeatureView( - name="driver_locations_100", - entities=[driver], - ttl=timedelta(days=1), - schema=[ - Field(name="lat", dtype=Float32), - Field(name="lon", dtype=String), - Field(name="name", dtype=String), - Field(name="test", dtype=String), - ], - online=True, - batch_source=driver_locations_source, - tags={}, - ) - - store.apply([driver_locations_100]) - - basic_rw_test(store, view_name="driver_locations") - basic_rw_test(store, view_name="driver_locations_100") diff --git a/sdk/python/tests/unit/cli/test_cli.py b/sdk/python/tests/unit/cli/test_cli.py new file mode 100644 index 0000000000..9b535ce8fb --- /dev/null +++ b/sdk/python/tests/unit/cli/test_cli.py @@ -0,0 +1,140 @@ +import tempfile +from contextlib import contextmanager +from pathlib import Path +from textwrap import dedent + +from assertpy import assertpy + +from tests.utils.cli_repo_creator import CliRunner + + +def test_3rd_party_providers() -> None: + """ + Test running apply on third party providers + """ + runner = CliRunner() + # Check with incorrect built-in provider name (no dots) + with setup_third_party_provider_repo("feast123") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains(b"Provider 'feast123' is not implemented") + # Check with incorrect third-party provider name (with dots) + with setup_third_party_provider_repo("feast_foo.Provider") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + 
assertpy.assert_that(output).contains( + b"Could not import module 'feast_foo' while attempting to load class 'Provider'" + ) + # Check with incorrect third-party provider name (with dots) + with setup_third_party_provider_repo("foo.FooProvider") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b"Could not import class 'FooProvider' from module 'foo'" + ) + # Check with correct third-party provider name + with setup_third_party_provider_repo("foo.provider.FooProvider") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(0) + + +def test_3rd_party_registry_store() -> None: + """ + Test running apply on third party registry stores + """ + runner = CliRunner() + # Check with incorrect built-in provider name (no dots) + with setup_third_party_registry_store_repo("feast123") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b'Registry store class name should end with "RegistryStore"' + ) + # Check with incorrect third-party registry store name (with dots) + with setup_third_party_registry_store_repo("feast_foo.RegistryStore") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b"Could not import module 'feast_foo' while attempting to load class 'RegistryStore'" + ) + # Check with incorrect third-party registry store name (with dots) + with setup_third_party_registry_store_repo("foo.FooRegistryStore") as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b"Could not import class 
'FooRegistryStore' from module 'foo'" + ) + # Check with correct third-party registry store name + with setup_third_party_registry_store_repo( + "foo.registry_store.FooRegistryStore" + ) as repo_path: + return_code, output = runner.run_with_output(["apply"], cwd=repo_path) + assertpy.assert_that(return_code).is_equal_to(0) + + +@contextmanager +def setup_third_party_provider_repo(provider_name: str): + with tempfile.TemporaryDirectory() as repo_dir_name: + + # Construct an example repo in a temporary dir + repo_path = Path(repo_dir_name) + + repo_config = repo_path / "feature_store.yaml" + + repo_config.write_text( + dedent( + f""" + project: foo + registry: data/registry.db + provider: {provider_name} + online_store: + path: data/online_store.db + type: sqlite + offline_store: + type: file + """ + ) + ) + + (repo_path / "foo").mkdir() + repo_example = repo_path / "foo/provider.py" + repo_example.write_text( + (Path(__file__).parents[2] / "foo_provider.py").read_text() + ) + + yield repo_path + + +@contextmanager +def setup_third_party_registry_store_repo(registry_store: str): + with tempfile.TemporaryDirectory() as repo_dir_name: + + # Construct an example repo in a temporary dir + repo_path = Path(repo_dir_name) + + repo_config = repo_path / "feature_store.yaml" + + repo_config.write_text( + dedent( + f""" + project: foo + registry: + registry_store_type: {registry_store} + path: foobar://foo.bar + provider: local + online_store: + path: data/online_store.db + type: sqlite + offline_store: + type: file + """ + ) + ) + + (repo_path / "foo").mkdir() + repo_example = repo_path / "foo/registry_store.py" + repo_example.write_text( + (Path(__file__).parents[2] / "foo_registry_store.py").read_text() + ) + + yield repo_path diff --git a/sdk/python/tests/integration/registration/test_cli_apply_duplicates.py b/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py similarity index 86% rename from sdk/python/tests/integration/registration/test_cli_apply_duplicates.py rename 
to sdk/python/tests/unit/cli/test_cli_apply_duplicates.py index bad3b50a80..998662781e 100644 --- a/sdk/python/tests/integration/registration/test_cli_apply_duplicates.py +++ b/sdk/python/tests/unit/cli/test_cli_apply_duplicates.py @@ -2,7 +2,7 @@ from pathlib import Path from textwrap import dedent -from tests.utils.cli_utils import CliRunner, get_example_repo +from tests.utils.cli_repo_creator import CliRunner, get_example_repo def test_cli_apply_duplicated_featureview_names() -> None: @@ -49,9 +49,8 @@ def run_simple_apply_test(example_repo_file_name: str, expected_error: bytes): def test_cli_apply_imported_featureview() -> None: """ - Test apply feature views with duplicated names and single py file in a feature repo using CLI + Tests that applying a feature view imported from a separate Python file is successful. """ - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: runner = CliRunner() # Construct an example repo in a temporary dir @@ -72,8 +71,11 @@ def test_cli_apply_imported_featureview() -> None: ) ) + # Import feature view from an existing file so it exists in two files. repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_2.py")) + repo_example.write_text( + get_example_repo("example_feature_repo_with_driver_stats_feature_view.py") + ) repo_example_2 = repo_path / "example_2.py" repo_example_2.write_text( "from example import driver_hourly_stats_view\n" @@ -92,9 +94,9 @@ def test_cli_apply_imported_featureview() -> None: def test_cli_apply_imported_featureview_with_duplication() -> None: """ - Test apply feature views with duplicated names and single py file in a feature repo using CLI + Tests that applying feature views with duplicated names is not possible, even if one of the + duplicated feature views is imported from another file. 
""" - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: runner = CliRunner() # Construct an example repo in a temporary dir @@ -115,8 +117,11 @@ def test_cli_apply_imported_featureview_with_duplication() -> None: ) ) + # Import feature view with duplicated name to try breaking the deduplication logic. repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_2.py")) + repo_example.write_text( + get_example_repo("example_feature_repo_with_driver_stats_feature_view.py") + ) repo_example_2 = repo_path / "example_2.py" repo_example_2.write_text( "from datetime import timedelta\n" @@ -147,7 +152,6 @@ def test_cli_apply_duplicated_featureview_names_multiple_py_files() -> None: """ Test apply feature views with duplicated names from multiple py files in a feature repo using CLI """ - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: runner = CliRunner() # Construct an example repo in a temporary dir @@ -170,7 +174,11 @@ def test_cli_apply_duplicated_featureview_names_multiple_py_files() -> None: # Create multiple py files containing the same feature view name for i in range(3): repo_example = repo_path / f"example{i}.py" - repo_example.write_text(get_example_repo("example_feature_repo_2.py")) + repo_example.write_text( + get_example_repo( + "example_feature_repo_with_driver_stats_feature_view.py" + ) + ) rc, output = runner.run_with_output(["apply"], cwd=repo_path) assert ( diff --git a/sdk/python/tests/integration/registration/test_cli_chdir.py b/sdk/python/tests/unit/cli/test_cli_chdir.py similarity index 97% rename from sdk/python/tests/integration/registration/test_cli_chdir.py rename to sdk/python/tests/unit/cli/test_cli_chdir.py index ff26c2f5e2..8260a95efd 100644 --- a/sdk/python/tests/integration/registration/test_cli_chdir.py +++ b/sdk/python/tests/unit/cli/test_cli_chdir.py @@ -2,7 +2,7 @@ from datetime import 
datetime, timedelta from pathlib import Path -from tests.utils.cli_utils import CliRunner +from tests.utils.cli_repo_creator import CliRunner def test_cli_chdir() -> None: diff --git a/sdk/python/tests/unit/diff/test_registry_diff.py b/sdk/python/tests/unit/diff/test_registry_diff.py index ae10c834c8..0effdfba97 100644 --- a/sdk/python/tests/unit/diff/test_registry_diff.py +++ b/sdk/python/tests/unit/diff/test_registry_diff.py @@ -4,17 +4,23 @@ ) from feast.entity import Entity from feast.feature_view import FeatureView -from tests.utils.data_source_utils import prep_file_source +from tests.utils.data_source_test_creator import prep_file_source def test_tag_objects_for_keep_delete_update_add(simple_dataset_1): with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: entity = Entity(name="id", join_keys=["id"]) to_delete = FeatureView( - name="to_delete", entities=[entity], batch_source=file_source, ttl=None, + name="to_delete", + entities=[entity], + batch_source=file_source, + ttl=None, ) unchanged_fv = FeatureView( - name="fv1", entities=[entity], batch_source=file_source, ttl=None, + name="fv1", + entities=[entity], + batch_source=file_source, + ttl=None, ) pre_changed = FeatureView( name="fv2", @@ -31,7 +37,10 @@ def test_tag_objects_for_keep_delete_update_add(simple_dataset_1): tags={"when": "after"}, ) to_add = FeatureView( - name="to_add", entities=[entity], batch_source=file_source, ttl=None, + name="to_add", + entities=[entity], + batch_source=file_source, + ttl=None, ) keep, delete, update, add = tag_objects_for_keep_delete_update_add( diff --git a/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py b/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py index 25eb061930..c8eca6201f 100644 --- a/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py +++ b/sdk/python/tests/unit/infra/online_store/test_dynamodb_online_store.py @@ -14,10 +14,10 @@ from 
feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import RepoConfig -from tests.utils.online_store_utils import ( - _create_n_customer_test_samples, - _create_test_table, - _insert_data_test_table, +from tests.utils.dynamo_table_creator import ( + create_n_customer_test_samples, + create_test_table, + insert_data_test_table, ) REGISTRY = "s3://test_registry/registry.db" @@ -165,9 +165,9 @@ def test_dynamodb_online_store_online_read( ): """Test DynamoDBOnlineStore online_read method.""" db_table_name = f"{TABLE_NAME}_online_read_{n_samples}" - _create_test_table(PROJECT, db_table_name, REGION) - data = _create_n_customer_test_samples(n=n_samples) - _insert_data_test_table(data, PROJECT, db_table_name, REGION) + create_test_table(PROJECT, db_table_name, REGION) + data = create_n_customer_test_samples(n=n_samples) + insert_data_test_table(data, PROJECT, db_table_name, REGION) entity_keys, features, *rest = zip(*data) returned_items = dynamodb_online_store.online_read( @@ -186,8 +186,8 @@ def test_dynamodb_online_store_online_write_batch( ): """Test DynamoDBOnlineStore online_write_batch method.""" db_table_name = f"{TABLE_NAME}_online_write_batch_{n_samples}" - _create_test_table(PROJECT, db_table_name, REGION) - data = _create_n_customer_test_samples() + create_test_table(PROJECT, db_table_name, REGION) + data = create_n_customer_test_samples() entity_keys, features, *rest = zip(*data) dynamodb_online_store.online_write_batch( @@ -211,10 +211,10 @@ def test_dynamodb_online_store_update(repo_config, dynamodb_online_store): """Test DynamoDBOnlineStore update method.""" # create dummy table to keep db_table_keep_name = f"{TABLE_NAME}_keep_update" - _create_test_table(PROJECT, db_table_keep_name, REGION) + create_test_table(PROJECT, db_table_keep_name, REGION) # create dummy table to delete db_table_delete_name = f"{TABLE_NAME}_delete_update" - 
_create_test_table(PROJECT, db_table_delete_name, REGION) + create_test_table(PROJECT, db_table_delete_name, REGION) dynamodb_online_store.update( config=repo_config, @@ -240,8 +240,8 @@ def test_dynamodb_online_store_teardown(repo_config, dynamodb_online_store): """Test DynamoDBOnlineStore teardown method.""" db_table_delete_name_one = f"{TABLE_NAME}_delete_teardown_1" db_table_delete_name_two = f"{TABLE_NAME}_delete_teardown_2" - _create_test_table(PROJECT, db_table_delete_name_one, REGION) - _create_test_table(PROJECT, db_table_delete_name_two, REGION) + create_test_table(PROJECT, db_table_delete_name_one, REGION) + create_test_table(PROJECT, db_table_delete_name_two, REGION) dynamodb_online_store.teardown( config=repo_config, @@ -267,9 +267,9 @@ def test_dynamodb_online_store_online_read_unknown_entity( ): """Test DynamoDBOnlineStore online_read method.""" n_samples = 2 - _create_test_table(PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION) - data = _create_n_customer_test_samples(n=n_samples) - _insert_data_test_table( + create_test_table(PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION) + data = create_n_customer_test_samples(n=n_samples) + insert_data_test_table( data, PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION ) @@ -304,14 +304,14 @@ def test_dynamodb_online_store_online_read_unknown_entity( def test_write_batch_non_duplicates(repo_config, dynamodb_online_store): """Test DynamoDBOnline Store deduplicate write batch request items.""" dynamodb_tbl = f"{TABLE_NAME}_batch_non_duplicates" - _create_test_table(PROJECT, dynamodb_tbl, REGION) - data = _create_n_customer_test_samples() + create_test_table(PROJECT, dynamodb_tbl, REGION) + data = create_n_customer_test_samples() data_duplicate = deepcopy(data) dynamodb_resource = boto3.resource("dynamodb", region_name=REGION) table_instance = dynamodb_resource.Table(f"{PROJECT}.{dynamodb_tbl}") # Insert duplicate data dynamodb_online_store._write_batch_non_duplicates( - 
table_instance, data + data_duplicate, progress=None + table_instance, data + data_duplicate, None, repo_config ) # Request more items than inserted response = table_instance.scan(Limit=20) @@ -330,9 +330,9 @@ def test_dynamodb_online_store_online_read_unknown_entity_end_of_batch( """ batch_size = repo_config.online_store.batch_size n_samples = batch_size - _create_test_table(PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION) - data = _create_n_customer_test_samples(n=n_samples) - _insert_data_test_table( + create_test_table(PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION) + data = create_n_customer_test_samples(n=n_samples) + insert_data_test_table( data, PROJECT, f"{TABLE_NAME}_unknown_entity_{n_samples}", REGION ) diff --git a/sdk/python/tests/integration/scaffolding/test_repo_config.py b/sdk/python/tests/unit/infra/scaffolding/test_repo_config.py similarity index 100% rename from sdk/python/tests/integration/scaffolding/test_repo_config.py rename to sdk/python/tests/unit/infra/scaffolding/test_repo_config.py diff --git a/sdk/python/tests/integration/scaffolding/test_repo_operations.py b/sdk/python/tests/unit/infra/scaffolding/test_repo_operations.py similarity index 100% rename from sdk/python/tests/integration/scaffolding/test_repo_operations.py rename to sdk/python/tests/unit/infra/scaffolding/test_repo_operations.py diff --git a/sdk/python/tests/unit/infra/test_inference_unit_tests.py b/sdk/python/tests/unit/infra/test_inference_unit_tests.py new file mode 100644 index 0000000000..7a564679d6 --- /dev/null +++ b/sdk/python/tests/unit/infra/test_inference_unit_tests.py @@ -0,0 +1,382 @@ +import pandas as pd +import pytest + +from feast import BigQuerySource, FileSource, RedshiftSource, SnowflakeSource +from feast.data_source import RequestSource +from feast.entity import Entity +from feast.errors import DataSourceNoNameException, SpecifiedFeaturesNotPresentError +from feast.feature import Feature +from feast.feature_service import 
FeatureService +from feast.feature_view import FeatureView +from feast.field import Field +from feast.inference import update_feature_views_with_inferred_features_and_entities +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( + SparkSource, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.repo_config import RepoConfig +from feast.types import Float32, Float64, Int64, String, UnixTimestamp, ValueType +from tests.utils.data_source_test_creator import prep_file_source + + +def test_infer_datasource_names_file(): + file_path = "path/to/test.csv" + data_source = FileSource(path=file_path) + assert data_source.name == file_path + + source_name = "my_name" + data_source = FileSource(name=source_name, path=file_path) + assert data_source.name == source_name + + +def test_infer_datasource_names_dwh(): + table = "project.table" + dwh_classes = [BigQuerySource, RedshiftSource, SnowflakeSource, SparkSource] + + for dwh_class in dwh_classes: + data_source = dwh_class(table=table) + assert data_source.name == table + + source_name = "my_name" + data_source_with_table = dwh_class(name=source_name, table=table) + assert data_source_with_table.name == source_name + data_source_with_query = dwh_class( + name=source_name, query=f"SELECT * from {table}" + ) + assert data_source_with_query.name == source_name + + # If we have a query and no name, throw an error + if dwh_class == SparkSource: + with pytest.raises(DataSourceNoNameException): + print(f"Testing dwh {dwh_class}") + data_source = dwh_class(query="test_query") + else: + data_source = dwh_class(query="test_query") + assert data_source.name == "" + + +def test_on_demand_features_type_inference(): + # Create Feature Views + date_request = RequestSource( + name="date_request", + schema=[Field(name="some_date", dtype=UnixTimestamp)], + ) + + @on_demand_feature_view( + sources=[date_request], + schema=[ + Field(name="output", dtype=UnixTimestamp), + 
Field(name="string_output", dtype=String), + ], + ) + def test_view(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) + return data + + test_view.infer_features() + + @on_demand_feature_view( + # Note: we deliberately use `inputs` instead of `sources` to test that `inputs` + # still works correctly, even though it is deprecated. + # TODO(felixwang9817): Remove references to `inputs` once it is fully deprecated. + inputs={"date_request": date_request}, + features=[ + Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), + Feature(name="object_output", dtype=ValueType.STRING), + ], + ) + def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + data["object_output"] = features_df["some_date"].astype(str) + return data + + with pytest.raises(ValueError, match="Value with native type object"): + invalid_test_view.infer_features() + + @on_demand_feature_view( + # Note: we deliberately use positional arguments here to test that they work correctly, + # even though positional arguments are deprecated in favor of keyword arguments. + # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. 
+ [ + Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), + Feature(name="missing", dtype=ValueType.STRING), + ], + {"date_request": date_request}, + ) + def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + return data + + with pytest.raises(SpecifiedFeaturesNotPresentError): + test_view_with_missing_feature.infer_features() + + +# TODO(kevjumba): remove this in feast 0.24 when deprecating +@pytest.mark.parametrize( + "request_source_schema", + [ + [Field(name="some_date", dtype=UnixTimestamp)], + {"some_date": ValueType.UNIX_TIMESTAMP}, + ], +) +def test_datasource_inference(request_source_schema): + # Create Feature Views + date_request = RequestSource( + name="date_request", + schema=request_source_schema, + ) + + @on_demand_feature_view( + # Note: we deliberately use positional arguments here to test that they work correctly, + # even though positional arguments are deprecated in favor of keyword arguments. + # TODO(felixwang9817): Remove positional arguments once they are fully deprecated. 
+ [ + Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), + Feature(name="string_output", dtype=ValueType.STRING), + ], + sources=[date_request], + ) + def test_view(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + data["string_output"] = features_df["some_date"].astype(pd.StringDtype()) + return data + + test_view.infer_features() + + @on_demand_feature_view( + sources=[date_request], + schema=[ + Field(name="output", dtype=UnixTimestamp), + Field(name="object_output", dtype=String), + ], + ) + def invalid_test_view(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + data["object_output"] = features_df["some_date"].astype(str) + return data + + with pytest.raises(ValueError, match="Value with native type object"): + invalid_test_view.infer_features() + + @on_demand_feature_view( + sources=[date_request], + features=[ + Feature(name="output", dtype=ValueType.UNIX_TIMESTAMP), + Feature(name="missing", dtype=ValueType.STRING), + ], + ) + def test_view_with_missing_feature(features_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["output"] = features_df["some_date"] + return data + + with pytest.raises(SpecifiedFeaturesNotPresentError): + test_view_with_missing_feature.infer_features() + + +def test_feature_view_inference_respects_basic_inference(): + """ + Tests that feature view inference respects the basic inference that occurs during creation. 
+ """ + file_source = FileSource(name="test", path="test path") + entity1 = Entity(name="test1", join_keys=["test_column_1"]) + entity2 = Entity(name="test2", join_keys=["test_column_2"]) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + schema=[ + Field(name="feature", dtype=Float32), + Field(name="test_column_1", dtype=String), + ], + source=file_source, + ) + feature_view_2 = FeatureView( + name="test2", + entities=[entity1, entity2], + schema=[ + Field(name="feature", dtype=Float32), + Field(name="test_column_1", dtype=String), + Field(name="test_column_2", dtype=String), + ], + source=file_source, + ) + + assert len(feature_view_1.schema) == 2 + assert len(feature_view_1.features) == 1 + assert len(feature_view_1.entity_columns) == 1 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + ) + assert len(feature_view_1.schema) == 2 + assert len(feature_view_1.features) == 1 + assert len(feature_view_1.entity_columns) == 1 + + assert len(feature_view_2.schema) == 3 + assert len(feature_view_2.features) == 1 + assert len(feature_view_2.entity_columns) == 2 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_2], + [entity1, entity2], + RepoConfig(provider="local", project="test"), + ) + assert len(feature_view_2.schema) == 3 + assert len(feature_view_2.features) == 1 + assert len(feature_view_2.entity_columns) == 2 + + +def test_feature_view_inference_on_entity_columns(simple_dataset_1): + """ + Tests that feature view inference correctly infers entity columns. 
+ """ + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity1 = Entity(name="test1", join_keys=["id_join_key"]) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + schema=[Field(name="int64_col", dtype=Int64)], + source=file_source, + ) + + assert len(feature_view_1.schema) == 1 + assert len(feature_view_1.features) == 1 + assert len(feature_view_1.entity_columns) == 0 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + ) + + # The schema is only used as a parameter, as is therefore not updated during inference. + assert len(feature_view_1.schema) == 1 + + # Since there is already a feature specified, additional features are not inferred. + assert len(feature_view_1.features) == 1 + + # The single entity column is inferred correctly. + assert len(feature_view_1.entity_columns) == 1 + + +def test_feature_view_inference_respects_entity_value_type(simple_dataset_1): + """ + Tests that feature view inference still respects an entity's value type. + """ + # TODO(felixwang9817): Remove this test once entity value_type is removed. + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity1 = Entity( + name="test1", join_keys=["id_join_key"], value_type=ValueType.STRING + ) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + schema=[Field(name="int64_col", dtype=Int64)], + source=file_source, + ) + + assert len(feature_view_1.schema) == 1 + assert len(feature_view_1.features) == 1 + assert len(feature_view_1.entity_columns) == 0 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + ) + + # The schema is only used as a parameter, as is therefore not updated during inference. 
+ assert len(feature_view_1.schema) == 1 + + # Since there is already a feature specified, additional features are not inferred. + assert len(feature_view_1.features) == 1 + + # The single entity column is inferred correctly and has type String. + assert len(feature_view_1.entity_columns) == 1 + assert feature_view_1.entity_columns[0].dtype == String + + +def test_feature_view_inference_on_feature_columns(simple_dataset_1): + """ + Tests that feature view inference correctly infers feature columns. + """ + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity1 = Entity(name="test1", join_keys=["id_join_key"]) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + schema=[Field(name="id_join_key", dtype=Int64)], + source=file_source, + ) + + assert len(feature_view_1.schema) == 1 + assert len(feature_view_1.features) == 0 + assert len(feature_view_1.entity_columns) == 1 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1], [entity1], RepoConfig(provider="local", project="test") + ) + + # The schema is only used as a parameter, as is therefore not updated during inference. + assert len(feature_view_1.schema) == 1 + + # All three feature columns are inferred correctly. + assert len(feature_view_1.features) == 3 + print(feature_view_1.features) + feature_column_1 = Field(name="float_col", dtype=Float64) + feature_column_2 = Field(name="int64_col", dtype=Int64) + feature_column_3 = Field(name="string_col", dtype=String) + assert feature_column_1 in feature_view_1.features + assert feature_column_2 in feature_view_1.features + assert feature_column_3 in feature_view_1.features + + # The single entity column remains. 
+ assert len(feature_view_1.entity_columns) == 1 + + +def test_update_feature_services_with_inferred_features(simple_dataset_1): + with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source: + entity1 = Entity(name="test1", join_keys=["id_join_key"]) + feature_view_1 = FeatureView( + name="test1", + entities=[entity1], + source=file_source, + ) + feature_view_2 = FeatureView( + name="test2", + entities=[entity1], + source=file_source, + ) + + feature_service = FeatureService( + name="fs_1", features=[feature_view_1[["string_col"]], feature_view_2] + ) + assert len(feature_service.feature_view_projections) == 2 + assert len(feature_service.feature_view_projections[0].features) == 0 + assert len(feature_service.feature_view_projections[0].desired_features) == 1 + assert len(feature_service.feature_view_projections[1].features) == 0 + assert len(feature_service.feature_view_projections[1].desired_features) == 0 + + update_feature_views_with_inferred_features_and_entities( + [feature_view_1, feature_view_2], + [entity1], + RepoConfig(provider="local", project="test"), + ) + feature_service.infer_features( + fvs_to_update={ + feature_view_1.name: feature_view_1, + feature_view_2.name: feature_view_2, + } + ) + + assert len(feature_view_1.schema) == 0 + assert len(feature_view_1.features) == 3 + assert len(feature_view_2.schema) == 0 + assert len(feature_view_2.features) == 3 + assert len(feature_service.feature_view_projections[0].features) == 1 + assert len(feature_service.feature_view_projections[1].features) == 3 + + +# TODO(felixwang9817): Add tests that interact with field mapping. 
diff --git a/sdk/python/tests/unit/infra/test_key_encoding_utils.py b/sdk/python/tests/unit/infra/test_key_encoding_utils.py new file mode 100644 index 0000000000..df691ea21e --- /dev/null +++ b/sdk/python/tests/unit/infra/test_key_encoding_utils.py @@ -0,0 +1,30 @@ +import pytest + +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto + + +def test_serialize_entity_key(): + # Should be fine + serialize_entity_key( + EntityKeyProto( + join_keys=["user"], entity_values=[ValueProto(int64_val=int(2**15))] + ), + entity_key_serialization_version=2, + ) + # True int64, but should also be fine. + serialize_entity_key( + EntityKeyProto( + join_keys=["user"], entity_values=[ValueProto(int64_val=int(2**31))] + ), + entity_key_serialization_version=2, + ) + + # Old serialization scheme, should fail. + with pytest.raises(BaseException): + serialize_entity_key( + EntityKeyProto( + join_keys=["user"], entity_values=[ValueProto(int64_val=int(2**31))] + ), + ) diff --git a/sdk/python/tests/unit/infra/test_local_registry.py b/sdk/python/tests/unit/infra/test_local_registry.py new file mode 100644 index 0000000000..d69ae6aafd --- /dev/null +++ b/sdk/python/tests/unit/infra/test_local_registry.py @@ -0,0 +1,535 @@ +# Copyright 2022 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from datetime import timedelta +from tempfile import mkstemp + +import pandas as pd +import pytest +from pytest_lazyfixture import lazy_fixture + +from feast import FileSource +from feast.aggregation import Aggregation +from feast.data_format import AvroFormat, ParquetFormat +from feast.data_source import KafkaSource +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_view import FeatureView +from feast.field import Field +from feast.on_demand_feature_view import RequestSource, on_demand_feature_view +from feast.registry import Registry +from feast.repo_config import RegistryConfig +from feast.stream_feature_view import StreamFeatureView +from feast.types import Array, Bytes, Float32, Int32, Int64, String +from feast.value_type import ValueType +from tests.utils.e2e_test_validation import validate_registry_data_source_apply + + +@pytest.fixture +def local_registry() -> Registry: + fd, registry_path = mkstemp() + registry_config = RegistryConfig(path=registry_path, cache_ttl_seconds=600) + return Registry(registry_config, None) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +def test_apply_entity_success(test_registry): + entity = Entity( + name="driver_car_id", + description="Car driver id", + tags={"team": "matchmaking"}, + ) + + project = "project" + + # Register Entity + test_registry.apply_entity(entity, project) + + entities = test_registry.list_entities(project) + + entity = entities[0] + assert ( + len(entities) == 1 + and entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + + entity = test_registry.get_entity("driver_car_id", project) + assert ( + entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + + test_registry.delete_entity("driver_car_id", project) + entities = 
test_registry.list_entities(project) + assert len(entities) == 0 + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +def test_apply_feature_view_success(test_registry): + # Create Feature Views + batch_source = FileSource( + file_format=ParquetFormat(), + path="file://feast/*", + timestamp_field="ts_col", + created_timestamp_column="timestamp", + ) + + entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) + + fv1 = FeatureView( + name="my_feature_view_1", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + ], + entities=[entity], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + project = "project" + + # Register Feature View + test_registry.apply_feature_view(fv1, project) + + feature_views = test_registry.list_feature_views(project) + + # List Feature Views + assert ( + len(feature_views) == 1 + and feature_views[0].name == "my_feature_view_1" + and feature_views[0].features[0].name == "fs1_my_feature_1" + and feature_views[0].features[0].dtype == Int64 + and feature_views[0].features[1].name == "fs1_my_feature_2" + and feature_views[0].features[1].dtype == String + and feature_views[0].features[2].name == "fs1_my_feature_3" + and feature_views[0].features[2].dtype == Array(String) + and feature_views[0].features[3].name == "fs1_my_feature_4" + and feature_views[0].features[3].dtype == Array(Bytes) + and feature_views[0].entities[0] == "fs1_my_entity_1" + ) + + feature_view = test_registry.get_feature_view("my_feature_view_1", project) + assert ( + feature_view.name == "my_feature_view_1" + and 
feature_view.features[0].name == "fs1_my_feature_1" + and feature_view.features[0].dtype == Int64 + and feature_view.features[1].name == "fs1_my_feature_2" + and feature_view.features[1].dtype == String + and feature_view.features[2].name == "fs1_my_feature_3" + and feature_view.features[2].dtype == Array(String) + and feature_view.features[3].name == "fs1_my_feature_4" + and feature_view.features[3].dtype == Array(Bytes) + and feature_view.entities[0] == "fs1_my_entity_1" + ) + + test_registry.delete_feature_view("my_feature_view_1", project) + feature_views = test_registry.list_feature_views(project) + assert len(feature_views) == 0 + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +def test_apply_on_demand_feature_view_success(test_registry): + # Create Feature Views + driver_stats = FileSource( + name="driver_stats_source", + path="data/driver_stats_lat_lon.parquet", + timestamp_field="event_timestamp", + created_timestamp_column="created", + description="A table describing the stats of a driver based on hourly logs", + owner="test2@gmail.com", + ) + + driver_daily_features_view = FeatureView( + name="driver_daily_features", + entities=["driver"], + ttl=timedelta(seconds=8640000000), + schema=[ + Field(name="daily_miles_driven", dtype=Float32), + Field(name="lat", dtype=Float32), + Field(name="lon", dtype=Float32), + Field(name="string_feature", dtype=String), + ], + online=True, + source=driver_stats, + tags={"production": "True"}, + owner="test2@gmail.com", + ) + + @on_demand_feature_view( + sources=[driver_daily_features_view], + schema=[Field(name="first_char", dtype=String)], + ) + def location_features_from_push(inputs: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["first_char"] = 
inputs["string_feature"].str[:1].astype("string") + return df + + project = "project" + + # Register Feature View + test_registry.apply_feature_view(location_features_from_push, project) + + feature_views = test_registry.list_on_demand_feature_views(project) + + # List Feature Views + assert ( + len(feature_views) == 1 + and feature_views[0].name == "location_features_from_push" + and feature_views[0].features[0].name == "first_char" + and feature_views[0].features[0].dtype == String + ) + + feature_view = test_registry.get_on_demand_feature_view( + "location_features_from_push", project + ) + assert ( + feature_view.name == "location_features_from_push" + and feature_view.features[0].name == "first_char" + and feature_view.features[0].dtype == String + ) + + test_registry.delete_feature_view("location_features_from_push", project) + feature_views = test_registry.list_on_demand_feature_views(project) + assert len(feature_views) == 0 + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +def test_apply_stream_feature_view_success(test_registry): + # Create Feature Views + def simple_udf(x: int): + return x + 3 + + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=FileSource(path="some path"), + watermark_delay_threshold=timedelta(days=1), + ) + + sfv = StreamFeatureView( + name="test kafka stream feature view", + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", + 
function="max", + time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + udf=simple_udf, + tags={}, + ) + + project = "project" + + # Register Feature View + test_registry.apply_feature_view(sfv, project) + + stream_feature_views = test_registry.list_stream_feature_views(project) + + # List Feature Views + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == sfv + + test_registry.delete_feature_view("test kafka stream feature view", project) + stream_feature_views = test_registry.list_stream_feature_views(project) + assert len(stream_feature_views) == 0 + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +# TODO(kevjumba): remove this in feast 0.24 when deprecating +@pytest.mark.parametrize( + "request_source_schema", + [[Field(name="my_input_1", dtype=Int32)], {"my_input_1": ValueType.INT32}], +) +def test_modify_feature_views_success(test_registry, request_source_schema): + # Create Feature Views + batch_source = FileSource( + file_format=ParquetFormat(), + path="file://feast/*", + timestamp_field="ts_col", + created_timestamp_column="timestamp", + ) + + request_source = RequestSource( + name="request_source", + schema=request_source_schema, + ) + + entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) + + fv1 = FeatureView( + name="my_feature_view_1", + schema=[Field(name="fs1_my_feature_1", dtype=Int64)], + entities=[entity], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + @on_demand_feature_view( + features=[ + Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING), + 
Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + ], + sources=[request_source], + ) + def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("category") + data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32") + return data + + project = "project" + + # Register Feature Views + test_registry.apply_feature_view(odfv1, project) + test_registry.apply_feature_view(fv1, project) + + # Modify odfv by changing a single feature dtype + @on_demand_feature_view( + features=[ + Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT), + Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32), + ], + sources=[request_source], + ) + def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame: + data = pd.DataFrame() + data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float") + data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32") + return data + + # Apply the modified odfv + test_registry.apply_feature_view(odfv1, project) + + # Check odfv + on_demand_feature_views = test_registry.list_on_demand_feature_views(project) + + assert ( + len(on_demand_feature_views) == 1 + and on_demand_feature_views[0].name == "odfv1" + and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1" + and on_demand_feature_views[0].features[0].dtype == Float32 + and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2" + and on_demand_feature_views[0].features[1].dtype == Int32 + ) + request_schema = on_demand_feature_views[0].get_request_data_schema() + assert ( + list(request_schema.keys())[0] == "my_input_1" + and list(request_schema.values())[0] == ValueType.INT32 + ) + + feature_view = test_registry.get_on_demand_feature_view("odfv1", project) + assert ( + feature_view.name == "odfv1" + and feature_view.features[0].name == "odfv1_my_feature_1" + and feature_view.features[0].dtype == Float32 + and feature_view.features[1].name == 
"odfv1_my_feature_2" + and feature_view.features[1].dtype == Int32 + ) + request_schema = feature_view.get_request_data_schema() + assert ( + list(request_schema.keys())[0] == "my_input_1" + and list(request_schema.values())[0] == ValueType.INT32 + ) + + # Make sure fv1 is untouched + feature_views = test_registry.list_feature_views(project) + + # List Feature Views + assert ( + len(feature_views) == 1 + and feature_views[0].name == "my_feature_view_1" + and feature_views[0].features[0].name == "fs1_my_feature_1" + and feature_views[0].features[0].dtype == Int64 + and feature_views[0].entities[0] == "fs1_my_entity_1" + ) + + feature_view = test_registry.get_feature_view("my_feature_view_1", project) + assert ( + feature_view.name == "my_feature_view_1" + and feature_view.features[0].name == "fs1_my_feature_1" + and feature_view.features[0].dtype == Int64 + and feature_view.entities[0] == "fs1_my_entity_1" + ) + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +@pytest.mark.parametrize( + "test_registry", + [lazy_fixture("local_registry")], +) +def test_apply_data_source(test_registry: Registry): + validate_registry_data_source_apply(test_registry) + + +def test_commit(): + fd, registry_path = mkstemp() + registry_config = RegistryConfig(path=registry_path, cache_ttl_seconds=600) + test_registry = Registry(registry_config, None) + + entity = Entity( + name="driver_car_id", + description="Car driver id", + tags={"team": "matchmaking"}, + ) + + project = "project" + + # Register Entity without commiting + test_registry.apply_entity(entity, project, commit=False) + assert test_registry.cached_registry_proto + assert len(test_registry.cached_registry_proto.project_metadata) == 1 + project_metadata = test_registry.cached_registry_proto.project_metadata[0] + project_uuid = project_metadata.project_uuid + assert 
len(project_uuid) == 36 + validate_project_uuid(project_uuid, test_registry) + + # Retrieving the entity should still succeed + entities = test_registry.list_entities(project, allow_cache=True) + entity = entities[0] + assert ( + len(entities) == 1 + and entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + validate_project_uuid(project_uuid, test_registry) + + entity = test_registry.get_entity("driver_car_id", project, allow_cache=True) + assert ( + entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + validate_project_uuid(project_uuid, test_registry) + + # Create new registry that points to the same store + registry_with_same_store = Registry(registry_config, None) + + # Retrieving the entity should fail since the store is empty + entities = registry_with_same_store.list_entities(project) + assert len(entities) == 0 + validate_project_uuid(project_uuid, registry_with_same_store) + + # commit from the original registry + test_registry.commit() + + # Reconstruct the new registry in order to read the newly written store + registry_with_same_store = Registry(registry_config, None) + + # Retrieving the entity should now succeed + entities = registry_with_same_store.list_entities(project) + entity = entities[0] + assert ( + len(entities) == 1 + and entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + validate_project_uuid(project_uuid, registry_with_same_store) + + entity = test_registry.get_entity("driver_car_id", project) + assert ( + entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file 
has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) + + +def validate_project_uuid(project_uuid, test_registry): + assert len(test_registry.cached_registry_proto.project_metadata) == 1 + project_metadata = test_registry.cached_registry_proto.project_metadata[0] + assert project_metadata.project_uuid == project_uuid diff --git a/sdk/python/tests/unit/infra/test_provider.py b/sdk/python/tests/unit/infra/test_provider.py index 5ed5603b03..217a1361b4 100644 --- a/sdk/python/tests/unit/infra/test_provider.py +++ b/sdk/python/tests/unit/infra/test_provider.py @@ -18,8 +18,8 @@ from feast.entity import Entity from feast.feature_view import FeatureView from feast.field import Field -from feast.infra.provider import _get_column_names from feast.types import String +from feast.utils import _get_column_names def test_get_column_names_preserves_feature_ordering(): diff --git a/sdk/python/tests/integration/online_store/test_e2e_local.py b/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py similarity index 52% rename from sdk/python/tests/integration/online_store/test_e2e_local.py rename to sdk/python/tests/unit/local_feast_tests/test_e2e_local.py index c5b66e7ddc..97d6463f5f 100644 --- a/sdk/python/tests/integration/online_store/test_e2e_local.py +++ b/sdk/python/tests/unit/local_feast_tests/test_e2e_local.py @@ -4,142 +4,17 @@ from pathlib import Path import pandas as pd -from pytz import utc +from feast import Entity, FeatureView, Field, FileSource from feast.driver_test_data import ( create_driver_hourly_stats_df, create_global_daily_stats_df, ) from feast.feature_store import FeatureStore -from tests.utils.cli_utils import CliRunner, get_example_repo - - -def _get_last_feature_row(df: pd.DataFrame, driver_id, max_date: datetime): - """Manually extract last feature value from a dataframe for a given driver_id with up to `max_date` date""" - filtered = df[ - (df["driver_id"] == driver_id) - & 
(df["event_timestamp"] < max_date.replace(tzinfo=utc)) - ] - max_ts = filtered.loc[filtered["event_timestamp"].idxmax()]["event_timestamp"] - filtered_by_ts = filtered[filtered["event_timestamp"] == max_ts] - return filtered_by_ts.loc[filtered_by_ts["created"].idxmax()] - - -def _assert_online_features( - store: FeatureStore, driver_df: pd.DataFrame, max_date: datetime -): - """Assert that features in online store are up to date with `max_date` date.""" - # Read features back - response = store.get_online_features( - features=[ - "driver_hourly_stats:conv_rate", - "driver_hourly_stats:avg_daily_trips", - "global_daily_stats:num_rides", - "global_daily_stats:avg_ride_length", - ], - entity_rows=[{"driver_id": 1001}], - full_feature_names=True, - ) - - # Float features should still be floats. - assert ( - response.proto.results[ - list(response.proto.metadata.feature_names.val).index( - "driver_hourly_stats__conv_rate" - ) - ] - .values[0] - .float_val - > 0 - ) - - result = response.to_dict() - assert len(result) == 5 - assert "driver_hourly_stats__avg_daily_trips" in result - assert "driver_hourly_stats__conv_rate" in result - assert ( - abs( - result["driver_hourly_stats__conv_rate"][0] - - _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] - ) - < 0.01 - ) - assert "global_daily_stats__num_rides" in result - assert "global_daily_stats__avg_ride_length" in result - - # Test the ODFV if it exists. - odfvs = store.list_on_demand_feature_views() - if odfvs and odfvs[0].name == "conv_rate_plus_100": - response = store.get_online_features( - features=[ - "conv_rate_plus_100:conv_rate_plus_100", - "conv_rate_plus_100:conv_rate_plus_val_to_add", - ], - entity_rows=[{"driver_id": 1001, "val_to_add": 100}], - full_feature_names=True, - ) - - # Check that float64 feature is stored correctly in proto format. 
- assert ( - response.proto.results[ - list(response.proto.metadata.feature_names.val).index( - "conv_rate_plus_100__conv_rate_plus_100" - ) - ] - .values[0] - .double_val - > 0 - ) - - result = response.to_dict() - assert len(result) == 3 - assert "conv_rate_plus_100__conv_rate_plus_100" in result - assert "conv_rate_plus_100__conv_rate_plus_val_to_add" in result - assert ( - abs( - result["conv_rate_plus_100__conv_rate_plus_100"][0] - - (_get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] + 100) - ) - < 0.01 - ) - assert ( - abs( - result["conv_rate_plus_100__conv_rate_plus_val_to_add"][0] - - (_get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] + 100) - ) - < 0.01 - ) - - -def _test_materialize_and_online_retrieval( - runner: CliRunner, - store: FeatureStore, - start_date: datetime, - end_date: datetime, - driver_df: pd.DataFrame, -): - assert store.repo_path is not None - - # Test `feast materialize` and online retrieval. - r = runner.run( - [ - "materialize", - start_date.isoformat(), - (end_date - timedelta(days=7)).isoformat(), - ], - cwd=Path(store.repo_path), - ) - - assert r.returncode == 0, f"stdout: {r.stdout}\n stderr: {r.stderr}" - _assert_online_features(store, driver_df, end_date - timedelta(days=7)) - - # Test `feast materialize-incremental` and online retrieval. 
- r = runner.run( - ["materialize-incremental", end_date.isoformat()], cwd=Path(store.repo_path), - ) - - assert r.returncode == 0, f"stdout: {r.stdout}\n stderr: {r.stderr}" - _assert_online_features(store, driver_df, end_date) +from feast.types import Float32, String +from tests.utils.basic_read_write_test import basic_rw_test +from tests.utils.cli_repo_creator import CliRunner, get_example_repo +from tests.utils.feature_records import validate_online_features def test_e2e_local() -> None: @@ -216,3 +91,73 @@ def test_e2e_local() -> None: assert returncode != 0 assert "feast.errors.FeastJoinKeysDuringMaterialization" in str(output) + + +def _test_materialize_and_online_retrieval( + runner: CliRunner, + store: FeatureStore, + start_date: datetime, + end_date: datetime, + driver_df: pd.DataFrame, +): + assert store.repo_path is not None + + # Test `feast materialize` and online retrieval. + r = runner.run( + [ + "materialize", + start_date.isoformat(), + (end_date - timedelta(days=7)).isoformat(), + ], + cwd=Path(store.repo_path), + ) + + assert r.returncode == 0, f"stdout: {r.stdout}\n stderr: {r.stderr}" + validate_online_features(store, driver_df, end_date - timedelta(days=7)) + + # Test `feast materialize-incremental` and online retrieval. + r = runner.run( + ["materialize-incremental", end_date.isoformat()], + cwd=Path(store.repo_path), + ) + + assert r.returncode == 0, f"stdout: {r.stdout}\n stderr: {r.stderr}" + validate_online_features(store, driver_df, end_date) + + +def test_partial() -> None: + """ + Add another table to existing repo using partial apply API. Make sure both the table + applied via CLI apply and the new table are passing RW test. 
+ """ + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_1.py"), "file" + ) as store: + driver = Entity(name="driver", join_keys=["test"]) + + driver_locations_source = FileSource( + path="data/driver_locations.parquet", # Fake path + timestamp_field="event_timestamp", + created_timestamp_column="created_timestamp", + ) + + driver_locations_100 = FeatureView( + name="driver_locations_100", + entities=[driver], + ttl=timedelta(days=1), + schema=[ + Field(name="lat", dtype=Float32), + Field(name="lon", dtype=String), + Field(name="name", dtype=String), + Field(name="test", dtype=String), + ], + online=True, + batch_source=driver_locations_source, + tags={}, + ) + + store.apply([driver_locations_100]) + + basic_rw_test(store, view_name="driver_locations") + basic_rw_test(store, view_name="driver_locations_100") diff --git a/sdk/python/tests/integration/registration/test_feature_service_apply.py b/sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py similarity index 77% rename from sdk/python/tests/integration/registration/test_feature_service_apply.py rename to sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py index 7824f6333e..dc642a6e3c 100644 --- a/sdk/python/tests/integration/registration/test_feature_service_apply.py +++ b/sdk/python/tests/unit/local_feast_tests/test_feature_service_apply.py @@ -1,19 +1,15 @@ -import pytest +from feast.feature_service import FeatureService +from tests.utils.cli_repo_creator import CliRunner, get_example_repo -from feast import FeatureService -from tests.utils.cli_utils import CliRunner, get_example_repo - -@pytest.mark.integration def test_read_pre_applied() -> None: """ Read feature values from the FeatureStore using a FeatureService. 
""" runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("example_feature_repo_with_feature_service.py"), "file" ) as store: - assert len(store.list_feature_services()) == 1 fs = store.get_feature_service("driver_locations_service") assert len(fs.tags) == 1 diff --git a/sdk/python/tests/integration/online_store/test_feature_service_read.py b/sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py similarity index 60% rename from sdk/python/tests/integration/online_store/test_feature_service_read.py rename to sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py index 33c318b9ed..2b5b311dc9 100644 --- a/sdk/python/tests/integration/online_store/test_feature_service_read.py +++ b/sdk/python/tests/unit/local_feast_tests/test_feature_service_read.py @@ -1,20 +1,15 @@ -import pytest +from tests.utils.basic_read_write_test import basic_rw_test +from tests.utils.cli_repo_creator import CliRunner, get_example_repo -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - -@pytest.mark.integration def test_feature_service_read() -> None: """ Read feature values from the FeatureStore using a FeatureService. 
""" - runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("example_feature_repo_with_feature_service.py"), "file" ) as store: - basic_rw_test( store, view_name="driver_locations", diff --git a/sdk/python/tests/integration/scaffolding/test_init.py b/sdk/python/tests/unit/local_feast_tests/test_init.py similarity index 97% rename from sdk/python/tests/integration/scaffolding/test_init.py rename to sdk/python/tests/unit/local_feast_tests/test_init.py index 1cada91ea0..f9bf536e56 100644 --- a/sdk/python/tests/integration/scaffolding/test_init.py +++ b/sdk/python/tests/unit/local_feast_tests/test_init.py @@ -3,7 +3,7 @@ from pathlib import Path from textwrap import dedent -from tests.utils.cli_utils import CliRunner +from tests.utils.cli_repo_creator import CliRunner def test_repo_init() -> None: diff --git a/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py b/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py new file mode 100644 index 0000000000..44a35e0660 --- /dev/null +++ b/sdk/python/tests/unit/local_feast_tests/test_local_feature_store.py @@ -0,0 +1,266 @@ +from datetime import datetime, timedelta +from tempfile import mkstemp + +import pytest +from pytest_lazyfixture import lazy_fixture + +from feast import FileSource +from feast.data_format import ParquetFormat +from feast.entity import Entity +from feast.feature_store import FeatureStore +from feast.feature_view import FeatureView +from feast.field import Field +from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig +from feast.repo_config import RepoConfig +from feast.types import Array, Bytes, Int64, String +from tests.utils.data_source_test_creator import prep_file_source + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_entity_success(test_feature_store): + entity = Entity( + name="driver_car_id", + 
description="Car driver id", + tags={"team": "matchmaking"}, + ) + + # Register Entity + test_feature_store.apply(entity) + + entities = test_feature_store.list_entities() + + entity = entities[0] + assert ( + len(entities) == 1 + and entity.name == "driver_car_id" + and entity.description == "Car driver id" + and "team" in entity.tags + and entity.tags["team"] == "matchmaking" + ) + + test_feature_store.teardown() + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_feature_view_success(test_feature_store): + # Create Feature Views + batch_source = FileSource( + file_format=ParquetFormat(), + path="file://feast/*", + timestamp_field="ts_col", + created_timestamp_column="timestamp", + date_partition_column="date_partition_col", + ) + + entity = Entity(name="fs1_my_entity_1", join_keys=["entity_id"]) + + fv1 = FeatureView( + name="my_feature_view_1", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + Field(name="entity_id", dtype=Int64), + ], + entities=[entity], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + # Register Feature View + test_feature_store.apply([entity, fv1]) + + feature_views = test_feature_store.list_feature_views() + + # List Feature Views + assert ( + len(feature_views) == 1 + and feature_views[0].name == "my_feature_view_1" + and feature_views[0].features[0].name == "fs1_my_feature_1" + and feature_views[0].features[0].dtype == Int64 + and feature_views[0].features[1].name == "fs1_my_feature_2" + and feature_views[0].features[1].dtype == String + and feature_views[0].features[2].name == "fs1_my_feature_3" + and feature_views[0].features[2].dtype == Array(String) + and feature_views[0].features[3].name == "fs1_my_feature_4" + and 
feature_views[0].features[3].dtype == Array(Bytes) + and feature_views[0].entities[0] == "fs1_my_entity_1" + ) + + test_feature_store.teardown() + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) +def test_apply_object_and_read(test_feature_store): + assert isinstance(test_feature_store, FeatureStore) + # Create Feature Views + batch_source = FileSource( + file_format=ParquetFormat(), + path="file://feast/*", + timestamp_field="ts_col", + created_timestamp_column="timestamp", + ) + + e1 = Entity(name="fs1_my_entity_1", description="something") + + e2 = Entity(name="fs1_my_entity_2", description="something") + + fv1 = FeatureView( + name="my_feature_view_1", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + Field(name="fs1_my_entity_1", dtype=Int64), + ], + entities=[e1], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + fv2 = FeatureView( + name="my_feature_view_2", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + Field(name="fs1_my_entity_2", dtype=Int64), + ], + entities=[e2], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + # Register Feature View + test_feature_store.apply([fv1, e1, fv2, e2]) + + fv1_actual = test_feature_store.get_feature_view("my_feature_view_1") + e1_actual = test_feature_store.get_entity("fs1_my_entity_1") + + assert e1 == e1_actual + assert fv2 != fv1_actual + assert e2 != e1_actual + + test_feature_store.teardown() + + +@pytest.mark.parametrize( + "test_feature_store", + [lazy_fixture("feature_store_with_local_registry")], +) 
+@pytest.mark.parametrize("dataframe_source", [lazy_fixture("simple_dataset_1")]) +def test_reapply_feature_view_success(test_feature_store, dataframe_source): + with prep_file_source(df=dataframe_source, timestamp_field="ts_1") as file_source: + + e = Entity(name="id", join_keys=["id_join_key"]) + + # Create Feature View + fv1 = FeatureView( + name="my_feature_view_1", + schema=[Field(name="string_col", dtype=String)], + entities=[e], + batch_source=file_source, + ttl=timedelta(minutes=5), + ) + + # Register Feature View + test_feature_store.apply([fv1, e]) + + # Check Feature View + fv_stored = test_feature_store.get_feature_view(fv1.name) + assert len(fv_stored.materialization_intervals) == 0 + + # Run materialization + test_feature_store.materialize(datetime(2020, 1, 1), datetime(2021, 1, 1)) + + # Check Feature View + fv_stored = test_feature_store.get_feature_view(fv1.name) + assert len(fv_stored.materialization_intervals) == 1 + + # Apply again + test_feature_store.apply([fv1]) + + # Check Feature View + fv_stored = test_feature_store.get_feature_view(fv1.name) + assert len(fv_stored.materialization_intervals) == 1 + + # Change and apply Feature View + fv1 = FeatureView( + name="my_feature_view_1", + schema=[Field(name="int64_col", dtype=Int64)], + entities=[e], + batch_source=file_source, + ttl=timedelta(minutes=5), + ) + test_feature_store.apply([fv1]) + + # Check Feature View + fv_stored = test_feature_store.get_feature_view(fv1.name) + assert len(fv_stored.materialization_intervals) == 0 + + test_feature_store.teardown() + + +def test_apply_conflicting_featureview_names(feature_store_with_local_registry): + """Test applying feature views with non-case-insensitively unique names""" + driver = Entity(name="driver", join_keys=["driver_id"]) + customer = Entity(name="customer", join_keys=["customer_id"]) + + driver_stats = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(seconds=10), + online=False, + 
batch_source=FileSource(path="driver_stats.parquet"), + tags={}, + ) + + customer_stats = FeatureView( + name="DRIVER_HOURLY_STATS", + entities=[customer], + ttl=timedelta(seconds=10), + online=False, + batch_source=FileSource(path="customer_stats.parquet"), + tags={}, + ) + try: + feature_store_with_local_registry.apply([driver_stats, customer_stats]) + error = None + except ValueError as e: + error = e + assert ( + isinstance(error, ValueError) + and "Please ensure that all feature view names are case-insensitively unique" + in error.args[0] + ) + + feature_store_with_local_registry.teardown() + + +@pytest.fixture +def feature_store_with_local_registry(): + fd, registry_path = mkstemp() + fd, online_store_path = mkstemp() + return FeatureStore( + config=RepoConfig( + registry=registry_path, + project="default", + provider="local", + online_store=SqliteOnlineStoreConfig(path=online_store_path), + ) + ) diff --git a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py b/sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py similarity index 87% rename from sdk/python/tests/integration/registration/test_stream_feature_view_apply.py rename to sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py index 8e2af031c5..0def3cc783 100644 --- a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py +++ b/sdk/python/tests/unit/local_feast_tests/test_stream_feature_view_apply.py @@ -1,7 +1,5 @@ from datetime import timedelta -import pytest - from feast.aggregation import Aggregation from feast.data_format import AvroFormat from feast.data_source import KafkaSource @@ -9,18 +7,17 @@ from feast.field import Field from feast.stream_feature_view import stream_feature_view from feast.types import Float32 -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.data_source_utils import prep_file_source +from tests.utils.cli_repo_creator import CliRunner, get_example_repo +from 
tests.utils.data_source_test_creator import prep_file_source -@pytest.mark.integration def test_apply_stream_feature_view(simple_dataset_1) -> None: """ Test apply of StreamFeatureView. """ runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("empty_feature_repo.py"), "file" ) as fs, prep_file_source( df=simple_dataset_1, timestamp_field="ts_1" ) as file_source: @@ -45,7 +42,9 @@ def test_apply_stream_feature_view(simple_dataset_1) -> None: description="desc", aggregations=[ Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), + column="dummy_field", + function="max", + time_window=timedelta(days=1), ), Aggregation( column="dummy_field2", @@ -68,7 +67,8 @@ def simple_sfv(df): assert stream_feature_views[0] == simple_sfv features = fs.get_online_features( - features=["simple_sfv:dummy_field"], entity_rows=[{"test_key": 1001}], + features=["simple_sfv:dummy_field"], + entity_rows=[{"test_key": 1001}], ).to_dict(include_event_timestamps=True) assert "test_key" in features @@ -77,14 +77,13 @@ def simple_sfv(df): assert features["dummy_field"] == [None] -@pytest.mark.integration def test_stream_feature_view_udf(simple_dataset_1) -> None: """ Test apply of StreamFeatureView udfs are serialized correctly and usable. 
""" runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("empty_feature_repo.py"), "file" ) as fs, prep_file_source( df=simple_dataset_1, timestamp_field="ts_1" ) as file_source: @@ -109,7 +108,9 @@ def test_stream_feature_view_udf(simple_dataset_1) -> None: description="desc", aggregations=[ Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), + column="dummy_field", + function="max", + time_window=timedelta(days=1), ), Aggregation( column="dummy_field2", diff --git a/sdk/python/tests/integration/online_store/test_online_retrieval.py b/sdk/python/tests/unit/online_store/test_online_retrieval.py similarity index 98% rename from sdk/python/tests/integration/online_store/test_online_retrieval.py rename to sdk/python/tests/unit/online_store/test_online_retrieval.py index 9cf4d9a182..731230a5f6 100644 --- a/sdk/python/tests/integration/online_store/test_online_retrieval.py +++ b/sdk/python/tests/unit/online_store/test_online_retrieval.py @@ -11,20 +11,18 @@ from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import RegistryConfig -from tests.utils.cli_utils import CliRunner, get_example_repo +from tests.utils.cli_repo_creator import CliRunner, get_example_repo -@pytest.mark.integration def test_online() -> None: """ Test reading from the online store in local mode. 
""" runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("example_feature_repo_1.py"), "file" ) as store: # Write some data to two tables - driver_locations_fv = store.get_feature_view(name="driver_locations") customer_profile_fv = store.get_feature_view(name="customer_profile") customer_driver_combined_fv = store.get_feature_view( @@ -251,13 +249,11 @@ def test_online() -> None: os.rename(store.config.registry + "_fake", store.config.registry) -@pytest.mark.integration def test_online_to_df(): """ Test dataframe conversion. Make sure the response columns and rows are the same order as the request. """ - driver_ids = [1, 2, 3] customer_ids = [4, 5, 6] name = "foo" @@ -268,7 +264,7 @@ def test_online_to_df(): runner = CliRunner() with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" + get_example_repo("example_feature_repo_1.py"), "file" ) as store: # Write three tables to online store driver_locations_fv = store.get_feature_view(name="driver_locations") diff --git a/sdk/python/tests/unit/test_data_sources.py b/sdk/python/tests/unit/test_data_sources.py index 0208a71503..0b437e50b9 100644 --- a/sdk/python/tests/unit/test_data_sources.py +++ b/sdk/python/tests/unit/test_data_sources.py @@ -20,7 +20,8 @@ def test_push_with_batch(): push_source = PushSource( - name="test", batch_source=BigQuerySource(table="test.test"), + name="test", + batch_source=BigQuerySource(table="test.test"), ) push_source_proto = push_source.to_proto() assert push_source_proto.HasField("batch_source") @@ -48,7 +49,11 @@ def test_request_source_primitive_type_to_proto(): Field(name="f2", dtype=Bool), ] request_source = RequestSource( - name="source", schema=schema, description="desc", tags={}, owner="feast", + name="source", + schema=schema, + description="desc", + tags={}, + owner="feast", ) request_proto = request_source.to_proto() deserialized_request_source = 
RequestSource.from_proto(request_proto) @@ -57,13 +62,16 @@ def test_request_source_primitive_type_to_proto(): def test_hash(): push_source_1 = PushSource( - name="test", batch_source=BigQuerySource(table="test.test"), + name="test", + batch_source=BigQuerySource(table="test.test"), ) push_source_2 = PushSource( - name="test", batch_source=BigQuerySource(table="test.test"), + name="test", + batch_source=BigQuerySource(table="test.test"), ) push_source_3 = PushSource( - name="test", batch_source=BigQuerySource(table="test.test2"), + name="test", + batch_source=BigQuerySource(table="test.test2"), ) push_source_4 = PushSource( name="test", @@ -253,3 +261,13 @@ def test_proto_conversion(): assert DataSource.from_proto(kinesis_source.to_proto()) == kinesis_source assert DataSource.from_proto(push_source.to_proto()) == push_source assert DataSource.from_proto(request_source.to_proto()) == request_source + + +def test_column_conflict(): + with pytest.raises(ValueError): + _ = FileSource( + name="test_source", + path="test_path", + timestamp_field="event_timestamp", + created_timestamp_column="event_timestamp", + ) diff --git a/sdk/python/tests/unit/test_entity.py b/sdk/python/tests/unit/test_entity.py index 04a857ddef..66ed02a71c 100644 --- a/sdk/python/tests/unit/test_entity.py +++ b/sdk/python/tests/unit/test_entity.py @@ -27,7 +27,9 @@ def test_join_key_default(): def test_entity_class_contains_tags(): with pytest.deprecated_call(): entity = Entity( - "my-entity", description="My entity", tags={"key1": "val1", "key2": "val2"}, + "my-entity", + description="My entity", + tags={"key1": "val1", "key2": "val2"}, ) assert "key1" in entity.tags.keys() and entity.tags["key1"] == "val1" assert "key2" in entity.tags.keys() and entity.tags["key2"] == "val2" diff --git a/sdk/python/tests/unit/test_feature_service.py b/sdk/python/tests/unit/test_feature_service.py index fc4fd70bcb..da69809b3e 100644 --- a/sdk/python/tests/unit/test_feature_service.py +++ 
b/sdk/python/tests/unit/test_feature_service.py @@ -5,6 +5,7 @@ from feast.field import Field from feast.infra.offline_stores.file_source import FileSource from feast.types import Float32 +from tests.utils.test_wrappers import no_warnings def test_feature_service_with_description(): @@ -16,7 +17,6 @@ def test_feature_service_with_description(): def test_feature_service_without_description(): feature_service = FeatureService(name="my-feature-service", features=[]) - # assert feature_service.to_proto().spec.description == "" @@ -75,19 +75,6 @@ def test_feature_view_kw_args_warning(): service = FeatureService(features=[], tags={"tag_1": "tag"}, description="desc") -def no_warnings(func): - def wrapper_no_warnings(*args, **kwargs): - with pytest.warns(None) as warnings: - func(*args, **kwargs) - - if len(warnings) > 0: - raise AssertionError( - "Warnings were raised: " + ", ".join([str(w) for w in warnings]) - ) - - return wrapper_no_warnings - - @no_warnings def test_feature_view_kw_args_normal(): file_source = FileSource(name="my-file-source", path="test.parquet") diff --git a/sdk/python/tests/unit/test_feature_views.py b/sdk/python/tests/unit/test_feature_views.py index d6be8e0341..7b608b621d 100644 --- a/sdk/python/tests/unit/test_feature_views.py +++ b/sdk/python/tests/unit/test_feature_views.py @@ -117,7 +117,9 @@ def test_stream_feature_view_serialization(): description="desc", aggregations=[ Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), + column="dummy_field", + function="max", + time_window=timedelta(days=1), ) ], timestamp_field="event_timestamp", @@ -153,7 +155,9 @@ def test_stream_feature_view_udfs(): description="desc", aggregations=[ Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), + column="dummy_field", + function="max", + time_window=timedelta(days=1), ) ], timestamp_field="event_timestamp", diff --git a/sdk/python/tests/unit/test_on_demand_feature_view.py 
b/sdk/python/tests/unit/test_on_demand_feature_view.py index 33435b8557..5a0f5c98d8 100644 --- a/sdk/python/tests/unit/test_on_demand_feature_view.py +++ b/sdk/python/tests/unit/test_on_demand_feature_view.py @@ -107,7 +107,8 @@ def test_hash(): def test_inputs_parameter_deprecation_in_odfv(): date_request = RequestSource( - name="date_request", schema=[Field(name="some_date", dtype=UnixTimestamp)], + name="date_request", + schema=[Field(name="some_date", dtype=UnixTimestamp)], ) with pytest.warns(DeprecationWarning): diff --git a/sdk/python/tests/unit/test_proto_json.py b/sdk/python/tests/unit/test_proto_json.py index 6bfdbbbf91..b5e01744e4 100644 --- a/sdk/python/tests/unit/test_proto_json.py +++ b/sdk/python/tests/unit/test_proto_json.py @@ -12,11 +12,6 @@ FeatureVector = GetOnlineFeaturesResponse.FeatureVector -@pytest.fixture(scope="module") -def proto_json_patch(): - proto_json.patch() - - def test_feature_vector_values(proto_json_patch): # FeatureVector contains "repeated values" proto field. # We want to test that feast.types.Value can take different types in JSON @@ -81,7 +76,7 @@ def test_feast_repeated_value(proto_json_patch): # additional structure (e.g. 
[1,2,3] instead of {"val": [1,2,3]}) repeated_value_str = "[1,2,3]" repeated_value_proto = RepeatedValue() - Parse(repeated_value_str, repeated_value_proto) + Parse(repeated_value_str, repeated_value_proto, "") assertpy.assert_that(len(repeated_value_proto.val)).is_equal_to(3) assertpy.assert_that(repeated_value_proto.val[0].int64_val).is_equal_to(1) assertpy.assert_that(repeated_value_proto.val[1].int64_val).is_equal_to(2) @@ -106,3 +101,8 @@ def test_feature_list(proto_json_patch): assertpy.assert_that(feature_list_json).is_equal_to( ["feature-a", "feature-b", "feature-c"] ) + + +@pytest.fixture(scope="module") +def proto_json_patch(): + proto_json.patch() diff --git a/sdk/python/tests/unit/test_serialization_version.py b/sdk/python/tests/unit/test_serialization_version.py new file mode 100644 index 0000000000..00562e4000 --- /dev/null +++ b/sdk/python/tests/unit/test_serialization_version.py @@ -0,0 +1,17 @@ +import tempfile + +from assertpy import assertpy + +from feast import RepoConfig + + +def test_registry_entity_serialization_version(): + with tempfile.TemporaryDirectory() as tmpdir: + r = RepoConfig( + project="prompt_dory", + provider="local", + online_store="redis", + registry=f"{tmpdir}/registry.db", + entity_key_serialization_version=2, + ) + assertpy.assert_that(r.entity_key_serialization_version).is_equal_to(2) diff --git a/sdk/python/tests/unit/test_usage.py b/sdk/python/tests/unit/test_usage.py index 13988d3264..ca84247430 100644 --- a/sdk/python/tests/unit/test_usage.py +++ b/sdk/python/tests/unit/test_usage.py @@ -234,4 +234,4 @@ def call_length_ms(call): return ( datetime.datetime.fromisoformat(call["end"]) - datetime.datetime.fromisoformat(call["start"]) - ).total_seconds() * 10 ** 3 + ).total_seconds() * 10**3 diff --git a/sdk/python/tests/utils/online_read_write_test.py b/sdk/python/tests/utils/basic_read_write_test.py similarity index 93% rename from sdk/python/tests/utils/online_read_write_test.py rename to 
sdk/python/tests/utils/basic_read_write_test.py index 39846cd2ad..5a93a05a1f 100644 --- a/sdk/python/tests/utils/online_read_write_test.py +++ b/sdk/python/tests/utils/basic_read_write_test.py @@ -11,7 +11,10 @@ def basic_rw_test( ) -> None: """ This is a provider-independent test suite for reading and writing from the online store, to - be used by provider-specific tests. + be used by provider-specific tests. + + The specified feature view must have exactly two features: one named 'lat' with type Float32 + and one with name 'lon' with type String. """ table = store.get_feature_view(name=view_name) diff --git a/sdk/python/tests/utils/cli_utils.py b/sdk/python/tests/utils/cli_repo_creator.py similarity index 91% rename from sdk/python/tests/utils/cli_utils.py rename to sdk/python/tests/utils/cli_repo_creator.py index ee6ea138fb..a038b85840 100644 --- a/sdk/python/tests/utils/cli_utils.py +++ b/sdk/python/tests/utils/cli_repo_creator.py @@ -84,13 +84,11 @@ def local_repo(self, example_repo_py: str, offline_store: str): repo_example.write_text(example_repo_py) result = self.run(["apply"], cwd=repo_path) - assert ( - result.returncode == 0 - ), f"stdout: {result.stdout}\n stderr: {result.stderr}" + print(f"Apply: stdout: {str(result.stdout)}\n stderr: {str(result.stderr)}") + assert result.returncode == 0 yield FeatureStore(repo_path=str(repo_path), config=None) result = self.run(["teardown"], cwd=repo_path) - assert ( - result.returncode == 0 - ), f"stdout: {result.stdout}\n stderr: {result.stderr}" + print(f"Apply: stdout: {str(result.stdout)}\n stderr: {str(result.stderr)}") + assert result.returncode == 0 diff --git a/sdk/python/tests/utils/data_source_utils.py b/sdk/python/tests/utils/data_source_test_creator.py similarity index 88% rename from sdk/python/tests/utils/data_source_utils.py rename to sdk/python/tests/utils/data_source_test_creator.py index d5f45964ca..3f10371734 100644 --- a/sdk/python/tests/utils/data_source_utils.py +++ 
b/sdk/python/tests/utils/data_source_test_creator.py @@ -16,7 +16,9 @@ def prep_file_source(df, timestamp_field=None) -> Iterator[FileSource]: f.close() df.to_parquet(f.name) file_source = FileSource( - file_format=ParquetFormat(), path=f.name, timestamp_field=timestamp_field, + file_format=ParquetFormat(), + path=f.name, + timestamp_field=timestamp_field, ) yield file_source @@ -38,7 +40,10 @@ def simple_bq_source_using_table_arg(df, timestamp_field=None) -> BigQuerySource job = client.load_table_from_dataframe(df, table) job.result() - return BigQuerySource(table=table, timestamp_field=timestamp_field,) + return BigQuerySource( + table=table, + timestamp_field=timestamp_field, + ) def simple_bq_source_using_query_arg(df, timestamp_field=None) -> BigQuerySource: diff --git a/sdk/python/tests/utils/online_store_utils.py b/sdk/python/tests/utils/dynamo_table_creator.py similarity index 85% rename from sdk/python/tests/utils/online_store_utils.py rename to sdk/python/tests/utils/dynamo_table_creator.py index f72b4d5a2a..20bac122b3 100644 --- a/sdk/python/tests/utils/online_store_utils.py +++ b/sdk/python/tests/utils/dynamo_table_creator.py @@ -8,7 +8,7 @@ from feast.protos.feast.types.Value_pb2 import Value as ValueProto -def _create_n_customer_test_samples(n=10): +def create_n_customer_test_samples(n=10): return [ ( EntityKeyProto( @@ -26,7 +26,7 @@ def _create_n_customer_test_samples(n=10): ] -def _create_test_table(project, tbl_name, region): +def create_test_table(project, tbl_name, region): client = boto3.client("dynamodb", region_name=region) client.create_table( TableName=f"{project}.{tbl_name}", @@ -36,16 +36,16 @@ def _create_test_table(project, tbl_name, region): ) -def _delete_test_table(project, tbl_name, region): +def delete_test_table(project, tbl_name, region): client = boto3.client("dynamodb", region_name=region) client.delete_table(TableName=f"{project}.{tbl_name}") -def _insert_data_test_table(data, project, tbl_name, region): +def 
insert_data_test_table(data, project, tbl_name, region): dynamodb_resource = boto3.resource("dynamodb", region_name=region) table_instance = dynamodb_resource.Table(f"{project}.{tbl_name}") for entity_key, features, timestamp, created_ts in data: - entity_id = compute_entity_id(entity_key) + entity_id = compute_entity_id(entity_key, entity_key_serialization_version=2) with table_instance.batch_writer() as batch: batch.put_item( Item={ diff --git a/sdk/python/tests/utils/e2e_test_validation.py b/sdk/python/tests/utils/e2e_test_validation.py new file mode 100644 index 0000000000..b2eb78f3c8 --- /dev/null +++ b/sdk/python/tests/utils/e2e_test_validation.py @@ -0,0 +1,277 @@ +import math +import os +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import List, Optional + +import pandas as pd +import pytest +import yaml +from pytz import utc + +from feast import FeatureStore, FeatureView, FileSource, RepoConfig +from feast.data_format import ParquetFormat +from feast.entity import Entity +from feast.field import Field +from feast.registry import Registry +from feast.types import Array, Bytes, Int64, String +from tests.integration.feature_repos.integration_test_repo_config import ( + IntegrationTestRepoConfig, +) +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.bigquery import ( + BigQueryDataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.file import ( + FileDataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.redshift import ( + RedshiftDataSourceCreator, +) + + +def validate_offline_online_store_consistency( + fs: FeatureStore, fv: FeatureView, split_dt: datetime +) -> None: + now = datetime.utcnow() + + full_feature_names = True + check_offline_store: bool = True + + # Run materialize() + # use both tz-naive & tz-aware timestamps to test that they're 
both correctly handled + start_date = (now - timedelta(hours=5)).replace(tzinfo=utc) + end_date = split_dt + fs.materialize(feature_views=[fv.name], start_date=start_date, end_date=end_date) + + time.sleep(10) + + # check result of materialize() + _check_offline_and_online_features( + fs=fs, + fv=fv, + driver_id=1, + event_timestamp=end_date, + expected_value=0.3, + full_feature_names=full_feature_names, + check_offline_store=check_offline_store, + ) + + _check_offline_and_online_features( + fs=fs, + fv=fv, + driver_id=2, + event_timestamp=end_date, + expected_value=None, + full_feature_names=full_feature_names, + check_offline_store=check_offline_store, + ) + + # check prior value for materialize_incremental() + _check_offline_and_online_features( + fs=fs, + fv=fv, + driver_id=3, + event_timestamp=end_date, + expected_value=4, + full_feature_names=full_feature_names, + check_offline_store=check_offline_store, + ) + + # run materialize_incremental() + fs.materialize_incremental(feature_views=[fv.name], end_date=now) + + # check result of materialize_incremental() + _check_offline_and_online_features( + fs=fs, + fv=fv, + driver_id=3, + event_timestamp=now, + expected_value=5, + full_feature_names=full_feature_names, + check_offline_store=check_offline_store, + ) + + +def _check_offline_and_online_features( + fs: FeatureStore, + fv: FeatureView, + driver_id: int, + event_timestamp: datetime, + expected_value: Optional[float], + full_feature_names: bool, + check_offline_store: bool = True, +) -> None: + # Check online store + response_dict = fs.get_online_features( + [f"{fv.name}:value"], + [{"driver_id": driver_id}], + full_feature_names=full_feature_names, + ).to_dict() + + if full_feature_names: + + if expected_value: + assert response_dict[f"{fv.name}__value"][0], f"Response: {response_dict}" + assert ( + abs(response_dict[f"{fv.name}__value"][0] - expected_value) < 1e-6 + ), f"Response: {response_dict}, Expected: {expected_value}" + else: + assert 
response_dict[f"{fv.name}__value"][0] is None + else: + if expected_value: + assert response_dict["value"][0], f"Response: {response_dict}" + assert ( + abs(response_dict["value"][0] - expected_value) < 1e-6 + ), f"Response: {response_dict}, Expected: {expected_value}" + else: + assert response_dict["value"][0] is None + + # Check offline store + if check_offline_store: + df = fs.get_historical_features( + entity_df=pd.DataFrame.from_dict( + {"driver_id": [driver_id], "event_timestamp": [event_timestamp]} + ), + features=[f"{fv.name}:value"], + full_feature_names=full_feature_names, + ).to_df() + + if full_feature_names: + if expected_value: + assert ( + abs( + df.to_dict(orient="list")[f"{fv.name}__value"][0] + - expected_value + ) + < 1e-6 + ) + else: + assert not df.to_dict(orient="list")[f"{fv.name}__value"] or math.isnan( + df.to_dict(orient="list")[f"{fv.name}__value"][0] + ) + else: + if expected_value: + assert ( + abs(df.to_dict(orient="list")["value"][0] - expected_value) < 1e-6 + ) + else: + assert not df.to_dict(orient="list")["value"] or math.isnan( + df.to_dict(orient="list")["value"][0] + ) + + +def make_feature_store_yaml(project, test_repo_config, repo_dir_name: Path): + offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) + + offline_store_config = offline_creator.create_offline_store_config() + online_store = test_repo_config.online_store + + config = RepoConfig( + registry=str(Path(repo_dir_name) / "registry.db"), + project=project, + provider=test_repo_config.provider, + offline_store=offline_store_config, + online_store=online_store, + repo_path=str(Path(repo_dir_name)), + ) + config_dict = config.dict() + if ( + isinstance(config_dict["online_store"], dict) + and "redis_type" in config_dict["online_store"] + ): + if str(config_dict["online_store"]["redis_type"]) == "RedisType.redis_cluster": + config_dict["online_store"]["redis_type"] = "redis_cluster" + elif str(config_dict["online_store"]["redis_type"]) == 
"RedisType.redis": + config_dict["online_store"]["redis_type"] = "redis" + config_dict["repo_path"] = str(config_dict["repo_path"]) + return yaml.safe_dump(config_dict) + + +NULLABLE_ONLINE_STORE_CONFIGS: List[IntegrationTestRepoConfig] = [ + IntegrationTestRepoConfig( + provider="local", + offline_store_creator=FileDataSourceCreator, + online_store=None, + ), +] + +if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True": + NULLABLE_ONLINE_STORE_CONFIGS.extend( + [ + IntegrationTestRepoConfig( + provider="gcp", + offline_store_creator=BigQueryDataSourceCreator, + online_store=None, + ), + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=RedshiftDataSourceCreator, + online_store=None, + ), + ] + ) + + +def validate_registry_data_source_apply(test_registry: Registry): + # Create Feature Views + batch_source = FileSource( + name="test_source", + file_format=ParquetFormat(), + path="file://feast/*", + timestamp_field="ts_col", + created_timestamp_column="timestamp", + ) + + entity = Entity(name="fs1_my_entity_1", join_keys=["test"]) + + fv1 = FeatureView( + name="my_feature_view_1", + schema=[ + Field(name="fs1_my_feature_1", dtype=Int64), + Field(name="fs1_my_feature_2", dtype=String), + Field(name="fs1_my_feature_3", dtype=Array(String)), + Field(name="fs1_my_feature_4", dtype=Array(Bytes)), + ], + entities=[entity], + tags={"team": "matchmaking"}, + batch_source=batch_source, + ttl=timedelta(minutes=5), + ) + + project = "project" + + # Register data source and feature view + test_registry.apply_data_source(batch_source, project, commit=False) + test_registry.apply_feature_view(fv1, project, commit=True) + + registry_feature_views = test_registry.list_feature_views(project) + registry_data_sources = test_registry.list_data_sources(project) + assert len(registry_feature_views) == 1 + assert len(registry_data_sources) == 1 + registry_feature_view = registry_feature_views[0] + assert registry_feature_view.batch_source == batch_source + 
registry_data_source = registry_data_sources[0] + assert registry_data_source == batch_source + + # Check that change to batch source propagates + batch_source.timestamp_field = "new_ts_col" + test_registry.apply_data_source(batch_source, project, commit=False) + test_registry.apply_feature_view(fv1, project, commit=True) + registry_feature_views = test_registry.list_feature_views(project) + registry_data_sources = test_registry.list_data_sources(project) + assert len(registry_feature_views) == 1 + assert len(registry_data_sources) == 1 + registry_feature_view = registry_feature_views[0] + assert registry_feature_view.batch_source == batch_source + registry_batch_source = test_registry.list_data_sources(project)[0] + assert registry_batch_source == batch_source + + test_registry.teardown() + + # Will try to reload registry, which will fail because the file has been deleted + with pytest.raises(FileNotFoundError): + test_registry._get_registry_proto(project=project) diff --git a/sdk/python/tests/utils/feature_records.py b/sdk/python/tests/utils/feature_records.py new file mode 100644 index 0000000000..acc08ec121 --- /dev/null +++ b/sdk/python/tests/utils/feature_records.py @@ -0,0 +1,496 @@ +from datetime import datetime, timedelta +from typing import Any, Dict, List, Optional + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from pytz import utc + +from feast import FeatureStore, utils +from feast.errors import FeatureNameCollisionError +from feast.feature_view import FeatureView + + +def convert_timestamp_records_to_utc( + records: List[Dict[str, Any]], column: str +) -> List[Dict[str, Any]]: + for record in records: + record[column] = utils.make_tzaware(record[column]).astimezone(utc) + return records + + +# Find the latest record in the given time range and filter +def find_latest_record( + records: List[Dict[str, Any]], + ts_key: str, + ts_start: datetime, + ts_end: datetime, + filter_keys: 
Optional[List[str]] = None, + filter_values: Optional[List[Any]] = None, +) -> Dict[str, Any]: + filter_keys = filter_keys or [] + filter_values = filter_values or [] + assert len(filter_keys) == len(filter_values) + found_record: Dict[str, Any] = {} + for record in records: + if ( + all( + [ + record[filter_key] == filter_value + for filter_key, filter_value in zip(filter_keys, filter_values) + ] + ) + and ts_start <= record[ts_key] <= ts_end + ): + if not found_record or found_record[ts_key] < record[ts_key]: + found_record = record + return found_record + + +def get_expected_training_df( + customer_df: pd.DataFrame, + customer_fv: FeatureView, + driver_df: pd.DataFrame, + driver_fv: FeatureView, + orders_df: pd.DataFrame, + order_fv: FeatureView, + location_df: pd.DataFrame, + location_fv: FeatureView, + global_df: pd.DataFrame, + global_fv: FeatureView, + field_mapping_df: pd.DataFrame, + field_mapping_fv: FeatureView, + entity_df: pd.DataFrame, + event_timestamp: str, + full_feature_names: bool = False, +): + # Convert all pandas dataframes into records with UTC timestamps + customer_records = convert_timestamp_records_to_utc( + customer_df.to_dict("records"), customer_fv.batch_source.timestamp_field + ) + driver_records = convert_timestamp_records_to_utc( + driver_df.to_dict("records"), driver_fv.batch_source.timestamp_field + ) + order_records = convert_timestamp_records_to_utc( + orders_df.to_dict("records"), event_timestamp + ) + location_records = convert_timestamp_records_to_utc( + location_df.to_dict("records"), location_fv.batch_source.timestamp_field + ) + global_records = convert_timestamp_records_to_utc( + global_df.to_dict("records"), global_fv.batch_source.timestamp_field + ) + field_mapping_records = convert_timestamp_records_to_utc( + field_mapping_df.to_dict("records"), + field_mapping_fv.batch_source.timestamp_field, + ) + entity_rows = convert_timestamp_records_to_utc( + entity_df.to_dict("records"), event_timestamp + ) + + # Set sufficiently 
large ttl that it effectively functions as infinite for the calculations below. + default_ttl = timedelta(weeks=52) + + # Manually do point-in-time join of driver, customer, and order records against + # the entity df + for entity_row in entity_rows: + customer_record = find_latest_record( + customer_records, + ts_key=customer_fv.batch_source.timestamp_field, + ts_start=entity_row[event_timestamp] + - _get_feature_view_ttl(customer_fv, default_ttl), + ts_end=entity_row[event_timestamp], + filter_keys=["customer_id"], + filter_values=[entity_row["customer_id"]], + ) + driver_record = find_latest_record( + driver_records, + ts_key=driver_fv.batch_source.timestamp_field, + ts_start=entity_row[event_timestamp] + - _get_feature_view_ttl(driver_fv, default_ttl), + ts_end=entity_row[event_timestamp], + filter_keys=["driver_id"], + filter_values=[entity_row["driver_id"]], + ) + order_record = find_latest_record( + order_records, + ts_key=customer_fv.batch_source.timestamp_field, + ts_start=entity_row[event_timestamp] + - _get_feature_view_ttl(order_fv, default_ttl), + ts_end=entity_row[event_timestamp], + filter_keys=["customer_id", "driver_id"], + filter_values=[entity_row["customer_id"], entity_row["driver_id"]], + ) + origin_record = find_latest_record( + location_records, + ts_key=location_fv.batch_source.timestamp_field, + ts_start=order_record[event_timestamp] + - _get_feature_view_ttl(location_fv, default_ttl), + ts_end=order_record[event_timestamp], + filter_keys=["location_id"], + filter_values=[order_record["origin_id"]], + ) + destination_record = find_latest_record( + location_records, + ts_key=location_fv.batch_source.timestamp_field, + ts_start=order_record[event_timestamp] + - _get_feature_view_ttl(location_fv, default_ttl), + ts_end=order_record[event_timestamp], + filter_keys=["location_id"], + filter_values=[order_record["destination_id"]], + ) + global_record = find_latest_record( + global_records, + ts_key=global_fv.batch_source.timestamp_field, + 
ts_start=order_record[event_timestamp] + - _get_feature_view_ttl(global_fv, default_ttl), + ts_end=order_record[event_timestamp], + ) + + field_mapping_record = find_latest_record( + field_mapping_records, + ts_key=field_mapping_fv.batch_source.timestamp_field, + ts_start=order_record[event_timestamp] + - _get_feature_view_ttl(field_mapping_fv, default_ttl), + ts_end=order_record[event_timestamp], + ) + + entity_row.update( + { + ( + f"customer_profile__{k}" if full_feature_names else k + ): customer_record.get(k, None) + for k in ( + "current_balance", + "avg_passenger_count", + "lifetime_trip_count", + ) + } + ) + entity_row.update( + { + (f"driver_stats__{k}" if full_feature_names else k): driver_record.get( + k, None + ) + for k in ("conv_rate", "avg_daily_trips") + } + ) + entity_row.update( + { + (f"order__{k}" if full_feature_names else k): order_record.get(k, None) + for k in ("order_is_success",) + } + ) + entity_row.update( + { + "origin__temperature": origin_record.get("temperature", None), + "destination__temperature": destination_record.get("temperature", None), + } + ) + entity_row.update( + { + (f"global_stats__{k}" if full_feature_names else k): global_record.get( + k, None + ) + for k in ( + "num_rides", + "avg_ride_length", + ) + } + ) + + # get field_mapping_record by column name, but label by feature name + entity_row.update( + { + ( + f"field_mapping__{feature}" if full_feature_names else feature + ): field_mapping_record.get(column, None) + for ( + column, + feature, + ) in field_mapping_fv.batch_source.field_mapping.items() + } + ) + + # Convert records back to pandas dataframe + expected_df = pd.DataFrame(entity_rows) + + # Move "event_timestamp" column to front + current_cols = expected_df.columns.tolist() + current_cols.remove(event_timestamp) + expected_df = expected_df[[event_timestamp] + current_cols] + + # Cast some columns to expected types, since we lose information when converting pandas DFs into Python objects. 
+ if full_feature_names: + expected_column_types = { + "order__order_is_success": "int32", + "driver_stats__conv_rate": "float32", + "customer_profile__current_balance": "float32", + "customer_profile__avg_passenger_count": "float32", + "global_stats__avg_ride_length": "float32", + "field_mapping__feature_name": "int32", + } + else: + expected_column_types = { + "order_is_success": "int32", + "conv_rate": "float32", + "current_balance": "float32", + "avg_passenger_count": "float32", + "avg_ride_length": "float32", + "feature_name": "int32", + } + + for col, typ in expected_column_types.items(): + expected_df[col] = expected_df[col].astype(typ) + + conv_feature_name = "driver_stats__conv_rate" if full_feature_names else "conv_rate" + conv_plus_feature_name = get_response_feature_name( + "conv_rate_plus_100", full_feature_names + ) + expected_df[conv_plus_feature_name] = expected_df[conv_feature_name] + 100 + expected_df[ + get_response_feature_name("conv_rate_plus_100_rounded", full_feature_names) + ] = ( + expected_df[conv_plus_feature_name] + .astype("float") + .round() + .astype(pd.Int32Dtype()) + ) + if "val_to_add" in expected_df.columns: + expected_df[ + get_response_feature_name("conv_rate_plus_val_to_add", full_feature_names) + ] = (expected_df[conv_feature_name] + expected_df["val_to_add"]) + + return expected_df + + +def get_response_feature_name(feature: str, full_feature_names: bool) -> str: + if feature in {"conv_rate", "avg_daily_trips"} and full_feature_names: + return f"driver_stats__{feature}" + + if ( + feature + in { + "conv_rate_plus_100", + "conv_rate_plus_100_rounded", + "conv_rate_plus_val_to_add", + } + and full_feature_names + ): + return f"conv_rate_plus_100__{feature}" + + return feature + + +def assert_feature_service_correctness( + store, feature_service, full_feature_names, entity_df, expected_df, event_timestamp +): + + job_from_df = store.get_historical_features( + entity_df=entity_df, + features=feature_service, + 
full_feature_names=full_feature_names, + ) + + actual_df_from_df_entities = job_from_df.to_df() + + expected_df = expected_df[ + [ + event_timestamp, + "order_id", + "driver_id", + "customer_id", + get_response_feature_name("conv_rate", full_feature_names), + get_response_feature_name("conv_rate_plus_100", full_feature_names), + "driver_age", + ] + ] + + validate_dataframes( + expected_df, + actual_df_from_df_entities, + keys=[event_timestamp, "order_id", "driver_id", "customer_id"], + ) + + +def assert_feature_service_entity_mapping_correctness( + store, feature_service, full_feature_names, entity_df, expected_df, event_timestamp +): + if full_feature_names: + job_from_df = store.get_historical_features( + entity_df=entity_df, + features=feature_service, + full_feature_names=full_feature_names, + ) + actual_df_from_df_entities = job_from_df.to_df() + + expected_df: pd.DataFrame = ( + expected_df.sort_values( + by=[ + event_timestamp, + "order_id", + "driver_id", + "customer_id", + "origin_id", + "destination_id", + ] + ) + .drop_duplicates() + .reset_index(drop=True) + ) + expected_df = expected_df[ + [ + event_timestamp, + "order_id", + "driver_id", + "customer_id", + "origin_id", + "destination_id", + "origin__temperature", + "destination__temperature", + ] + ] + + validate_dataframes( + expected_df, + actual_df_from_df_entities, + keys=[ + event_timestamp, + "order_id", + "driver_id", + "customer_id", + "origin_id", + "destination_id", + ], + ) + else: + # using 2 of the same FeatureView without full_feature_names=True will result in collision + with pytest.raises(FeatureNameCollisionError): + job_from_df = store.get_historical_features( + entity_df=entity_df, + features=feature_service, + full_feature_names=full_feature_names, + ) + + +def validate_dataframes(expected_df, actual_df, keys): + expected_df: pd.DataFrame = ( + expected_df.sort_values(by=keys).drop_duplicates().reset_index(drop=True) + ) + + actual_df = ( + actual_df[expected_df.columns] + 
.sort_values(by=keys) + .drop_duplicates() + .reset_index(drop=True) + ) + + pd_assert_frame_equal( + expected_df, + actual_df, + check_dtype=False, + ) + + +def _get_feature_view_ttl( + feature_view: FeatureView, default_ttl: timedelta +) -> timedelta: + """Returns the ttl of a feature view if it is non-zero. Otherwise returns the specified default.""" + return feature_view.ttl if feature_view.ttl else default_ttl + + +def validate_online_features( + store: FeatureStore, driver_df: pd.DataFrame, max_date: datetime +): + """Assert that features in online store are up to date with `max_date` date.""" + # Read features back + response = store.get_online_features( + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:avg_daily_trips", + "global_daily_stats:num_rides", + "global_daily_stats:avg_ride_length", + ], + entity_rows=[{"driver_id": 1001}], + full_feature_names=True, + ) + + # Float features should still be floats. + assert ( + response.proto.results[ + list(response.proto.metadata.feature_names.val).index( + "driver_hourly_stats__conv_rate" + ) + ] + .values[0] + .float_val + > 0 + ), response.to_dict() + + result = response.to_dict() + assert len(result) == 5 + assert "driver_hourly_stats__avg_daily_trips" in result + assert "driver_hourly_stats__conv_rate" in result + assert ( + abs( + result["driver_hourly_stats__conv_rate"][0] + - get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] + ) + < 0.01 + ) + assert "global_daily_stats__num_rides" in result + assert "global_daily_stats__avg_ride_length" in result + + # Test the ODFV if it exists. 
+ odfvs = store.list_on_demand_feature_views() + if odfvs and odfvs[0].name == "conv_rate_plus_100": + response = store.get_online_features( + features=[ + "conv_rate_plus_100:conv_rate_plus_100", + "conv_rate_plus_100:conv_rate_plus_val_to_add", + ], + entity_rows=[{"driver_id": 1001, "val_to_add": 100}], + full_feature_names=True, + ) + + # Check that float64 feature is stored correctly in proto format. + assert ( + response.proto.results[ + list(response.proto.metadata.feature_names.val).index( + "conv_rate_plus_100__conv_rate_plus_100" + ) + ] + .values[0] + .double_val + > 0 + ) + + result = response.to_dict() + assert len(result) == 3 + assert "conv_rate_plus_100__conv_rate_plus_100" in result + assert "conv_rate_plus_100__conv_rate_plus_val_to_add" in result + assert ( + abs( + result["conv_rate_plus_100__conv_rate_plus_100"][0] + - (get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] + 100) + ) + < 0.01 + ) + assert ( + abs( + result["conv_rate_plus_100__conv_rate_plus_val_to_add"][0] + - (get_last_feature_row(driver_df, 1001, max_date)["conv_rate"] + 100) + ) + < 0.01 + ) + + +def get_last_feature_row(df: pd.DataFrame, driver_id, max_date: datetime): + """Manually extract last feature value from a dataframe for a given driver_id with up to `max_date` date""" + filtered = df[ + (df["driver_id"] == driver_id) + & (df["event_timestamp"] < max_date.replace(tzinfo=utc)) + ] + max_ts = filtered.loc[filtered["event_timestamp"].idxmax()]["event_timestamp"] + filtered_by_ts = filtered[filtered["event_timestamp"] == max_ts] + return filtered_by_ts.loc[filtered_by_ts["created"].idxmax()] diff --git a/sdk/python/tests/utils/http_server.py b/sdk/python/tests/utils/http_server.py new file mode 100644 index 0000000000..47c6cb8ac1 --- /dev/null +++ b/sdk/python/tests/utils/http_server.py @@ -0,0 +1,13 @@ +import socket +from contextlib import closing + + +def free_port(): + sock = socket.socket() + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def 
check_port_open(host, port) -> bool: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + return sock.connect_ex((host, port)) == 0 diff --git a/sdk/python/tests/utils/online_write_benchmark.py b/sdk/python/tests/utils/online_write_benchmark.py index 9f2f8ba60d..8a138f41db 100644 --- a/sdk/python/tests/utils/online_write_benchmark.py +++ b/sdk/python/tests/utils/online_write_benchmark.py @@ -14,9 +14,9 @@ from feast.feature_store import FeatureStore from feast.feature_view import FeatureView from feast.field import Field -from feast.infra.provider import _convert_arrow_to_proto from feast.repo_config import RepoConfig from feast.types import Float32, Int32 +from feast.utils import _convert_arrow_to_proto def create_driver_hourly_stats_feature_view(source): diff --git a/sdk/python/tests/utils/logged_features.py b/sdk/python/tests/utils/test_log_creator.py similarity index 60% rename from sdk/python/tests/utils/logged_features.py rename to sdk/python/tests/utils/test_log_creator.py index dc844a60b4..ec0d92814c 100644 --- a/sdk/python/tests/utils/logged_features.py +++ b/sdk/python/tests/utils/test_log_creator.py @@ -3,11 +3,12 @@ import tempfile import uuid from pathlib import Path -from typing import Iterator, Union +from typing import Iterator, List, Union import numpy as np import pandas as pd import pyarrow +import pytz from feast import FeatureService, FeatureStore, FeatureView from feast.errors import FeatureViewNotFoundException @@ -15,6 +16,63 @@ from feast.protos.feast.serving.ServingService_pb2 import FieldStatus +def get_latest_rows( + df: pd.DataFrame, join_key: str, entity_values: List[str] +) -> pd.DataFrame: + """ + Return latest rows in a dataframe based on join key and entity values. + + Args: + df: Dataframe of features values. + join_key : Join key for the feature values in the dataframe. + entity_values : Entity values for the feature values in the dataframe. + + Returns: + The most recent row in the dataframe. 
+ """ + rows = df[df[join_key].isin(entity_values)] + return rows.loc[rows.groupby(join_key)["event_timestamp"].idxmax()] + + +def generate_expected_logs( + df: pd.DataFrame, + feature_view: FeatureView, + features: List[str], + join_keys: List[str], + timestamp_column: str, +) -> pd.DataFrame: + """ + Given dataframe and feature view, generate the expected logging dataframes that would be otherwise generated by our logging infrastructure. + Args: + df: Dataframe of features values returned in `get_online_features`. + feature_view : The feature view from which the features were retrieved. + features : The list of features defined as part of this base feature view. + join_keys : Join keys for the retrieved features. + timestamp_column : Timestamp column + + Returns: + Returns dataframe containing the expected logs. + """ + logs = pd.DataFrame() + for join_key in join_keys: + logs[join_key] = df[join_key] + + for feature in features: + col = f"{feature_view.name}__{feature}" + logs[col] = df[feature] + logs[f"{col}__timestamp"] = df[timestamp_column] + logs[f"{col}__status"] = FieldStatus.PRESENT + if feature_view.ttl: + logs[f"{col}__status"] = logs[f"{col}__status"].mask( + df[timestamp_column] + < datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) + - feature_view.ttl, + FieldStatus.OUTSIDE_MAX_AGE, + ) + + return logs.sort_values(by=join_keys).reset_index(drop=True) + + def prepare_logs( source_df: pd.DataFrame, feature_service: FeatureService, store: FeatureStore ) -> pd.DataFrame: diff --git a/sdk/python/tests/utils/test_wrappers.py b/sdk/python/tests/utils/test_wrappers.py new file mode 100644 index 0000000000..efee675790 --- /dev/null +++ b/sdk/python/tests/utils/test_wrappers.py @@ -0,0 +1,14 @@ +import pytest + + +def no_warnings(func): + def wrapper_no_warnings(*args, **kwargs): + with pytest.warns(None) as warnings: + func(*args, **kwargs) + + if len(warnings) > 0: + raise AssertionError( + "Warnings were raised: " + ", ".join([str(w) for w in warnings]) 
+ ) + + return wrapper_no_warnings diff --git a/setup.py b/setup.py index d32b13deed..f03aeefcf6 100644 --- a/setup.py +++ b/setup.py @@ -25,18 +25,18 @@ from pathlib import Path from subprocess import CalledProcessError -from setuptools import find_packages, Extension +from setuptools import Extension, find_packages try: from setuptools import setup - from setuptools.command.build_py import build_py from setuptools.command.build_ext import build_ext as _build_ext + from setuptools.command.build_py import build_py from setuptools.command.develop import develop from setuptools.command.install import install except ImportError: - from distutils.command.build_py import build_py from distutils.command.build_ext import build_ext as _build_ext + from distutils.command.build_py import build_py from distutils.core import setup NAME = "feast" @@ -46,31 +46,31 @@ REQUIRES_PYTHON = ">=3.7.0" REQUIRED = [ - "click>=7.0.0,<8.0.2", + "click>=7.0.0,<9.0.0", "colorama>=0.3.9,<1", "dill==0.3.*", "fastavro>=1.1.0,<2", "google-api-core>=1.23.0,<3", "googleapis-common-protos>=1.52.*,<2", - "grpcio>=1.34.0,<2", - "grpcio-reflection>=1.34.0,<2", + "grpcio>=1.47.0,<2", + "grpcio-reflection>=1.47.0,<2", "Jinja2>=2,<4", "jsonschema", "mmh3", - "numpy<1.22,<2", - "pandas>=1,<2", - "pandavro==1.5.*", - "protobuf>=3.10,<3.20", - "proto-plus==1.20.*", - "pyarrow>=4,<7", + "numpy>=1.22,<3", + "pandas>=1.4.3,<2", + "pandavro==1.5.*", # For some reason pandavro higher than 1.5.* only support pandas less than 1.3. 
+ "protobuf>3.20,<4", + "proto-plus>=1.20.0,<2", + "pyarrow>=4,<9", "pydantic>=1,<2", - "pygments==2.12.0", + "pygments>=2.12.0,<3", "PyYAML>=5.4.*,<7", "SQLAlchemy[mypy]>1,<2", - "tabulate==0.8.*", + "tabulate>=0.8.0,<1", "tenacity>=7,<9", - "toml==0.10.*", - "tqdm==4.*", + "toml>=0.10.0,<1", + "tqdm>=4,<5", "typeguard", "fastapi>=0.68.0,<1", "uvicorn[standard]>=0.14.0,<1", @@ -80,11 +80,10 @@ ] GCP_REQUIRED = [ - "google-cloud-bigquery>=2,<3", + "google-cloud-bigquery[pandas]>=2,<4", "google-cloud-bigquery-storage >= 2.0.0,<3", "google-cloud-datastore>=2.1.*,<3", - "google-cloud-storage>=1.34.*,<1.41", - "google-cloud-core>=1.4.0,<2.0.0", + "google-cloud-storage>=1.34.*,<3", ] REDIS_REQUIRED = [ @@ -127,12 +126,12 @@ CI_REQUIRED = ( [ "build", - "cryptography==35.0", + "cryptography>=35.0,<36", "flake8", - "black==19.10b0", + "black>=22.6.0,<23", "isort>=5,<6", - "grpcio-tools==1.44.0", - "grpcio-testing==1.44.0", + "grpcio-tools>=1.47.0", + "grpcio-testing>=1.47.0", "minio==7.1.0", "mock==2.0.0", "moto", @@ -154,7 +153,7 @@ "sphinx-rtd-theme", "testcontainers>=3.5,<4", "adlfs==0.5.9", - "firebase-admin==4.5.2", + "firebase-admin>=5.2.0,<6", "pre-commit", "assertpy==1.1", "pip-tools", @@ -428,12 +427,18 @@ def build_extension(self, ext: Extension): print(f"CWD: {os.getcwd()}") destination = os.path.dirname(os.path.abspath(self.get_ext_fullpath(ext.name))) - subprocess.check_call(["go", "install", "golang.org/x/tools/cmd/goimports"], - env={"PATH": bin_path, **go_env}) - subprocess.check_call(["go", "get", "github.com/go-python/gopy@v0.4.0"], - env={"PATH": bin_path, **go_env}) - subprocess.check_call(["go", "install", "github.com/go-python/gopy"], - env={"PATH": bin_path, **go_env}) + subprocess.check_call( + ["go", "install", "golang.org/x/tools/cmd/goimports"], + env={"PATH": bin_path, **go_env}, + ) + subprocess.check_call( + ["go", "get", "github.com/go-python/gopy@v0.4.4"], + env={"PATH": bin_path, **go_env}, + ) + subprocess.check_call( + ["go", "install", 
"github.com/go-python/gopy"], + env={"PATH": bin_path, **go_env}, + ) subprocess.check_call( [ "gopy", @@ -442,10 +447,17 @@ def build_extension(self, ext: Extension): destination, "-vm", sys.executable, + "--build-tags", + "cgo,ccalloc", + "--dynamic-link=True", "-no-make", *ext.sources, ], - env={"PATH": bin_path, "CGO_LDFLAGS_ALLOW": ".*", **go_env,}, + env={ + "PATH": bin_path, + "CGO_LDFLAGS_ALLOW": ".*", + **go_env, + }, ) def copy_extensions_to_source(self): @@ -514,8 +526,8 @@ def copy_extensions_to_source(self): use_scm_version=use_scm_version, setup_requires=[ "setuptools_scm", - "grpcio", - "grpcio-tools==1.44.0", + "grpcio>=1.47.0", + "grpcio-tools>=1.47.0", "mypy-protobuf==3.1", "pybindgen==0.22.0", "sphinx!=4.0.0", diff --git a/ui/package.json b/ui/package.json index 252faf8613..22128cc968 100644 --- a/ui/package.json +++ b/ui/package.json @@ -1,6 +1,6 @@ { "name": "@feast-dev/feast-ui", - "version": "0.20.4", + "version": "0.20.5", "private": false, "files": [ "dist" diff --git a/ui/src/FeastUISansProviders.test.tsx b/ui/src/FeastUISansProviders.test.tsx index 1289cea028..09985bc133 100644 --- a/ui/src/FeastUISansProviders.test.tsx +++ b/ui/src/FeastUISansProviders.test.tsx @@ -94,3 +94,46 @@ test("routes are reachable", async () => { }); } }); + + +const featureViewName = registry.featureViews[0].spec.name; +const featureName = registry.featureViews[0].spec.features[0].name; + +test("features are reachable", async () => { + render(); + + // Wait for content to load + await screen.findByText(/Explore this Project/i); + const routeRegExp = new RegExp("Feature Views", "i"); + + userEvent.click( + screen.getByRole("button", { name: routeRegExp }), + leftClick + ); + + screen.getByRole("heading", { + name: "Feature Views", + }); + + await screen.findAllByText(/Feature Views/i); + const fvRegExp = new RegExp(featureViewName, "i"); + + userEvent.click( + screen.getByRole("link", { name: fvRegExp }), + leftClick + ) + + await screen.findByText(featureName); 
+ const fRegExp = new RegExp(featureName, "i"); + + userEvent.click( + screen.getByRole("link", { name: fRegExp }), + leftClick + ) + // Should land on a page with the heading + // await screen.findByText("Feature: " + featureName); + screen.getByRole("heading", { + name: "Feature: " + featureName, + level: 1, + }); +}); diff --git a/ui/src/FeastUISansProviders.tsx b/ui/src/FeastUISansProviders.tsx index 628068f0f0..8a0e0b94db 100644 --- a/ui/src/FeastUISansProviders.tsx +++ b/ui/src/FeastUISansProviders.tsx @@ -13,6 +13,7 @@ import DatasourceIndex from "./pages/data-sources/Index"; import DatasetIndex from "./pages/saved-data-sets/Index"; import EntityIndex from "./pages/entities/Index"; import EntityInstance from "./pages/entities/EntityInstance"; +import FeatureInstance from "./pages/features/FeatureInstance"; import FeatureServiceIndex from "./pages/feature-services/Index"; import FeatureViewIndex from "./pages/feature-views/Index"; import FeatureViewInstance from "./pages/feature-views/FeatureViewInstance"; @@ -86,10 +87,12 @@ const FeastUISansProviders = ({ path="feature-view/" element={} /> + }> + } - /> + path="feature-view/:FeatureViewName/feature/:FeatureName/*" + element={} + /> } diff --git a/ui/src/components/FeaturesListDisplay.tsx b/ui/src/components/FeaturesListDisplay.tsx index abd9c1d2e4..dcb6ba81eb 100644 --- a/ui/src/components/FeaturesListDisplay.tsx +++ b/ui/src/components/FeaturesListDisplay.tsx @@ -4,25 +4,42 @@ import { FeastFeatureColumnType } from "../parsers/feastFeatureViews"; import useLoadFeatureViewSummaryStatistics from "../queries/useLoadFeatureViewSummaryStatistics"; import SparklineHistogram from "./SparklineHistogram"; import FeatureFlagsContext from "../contexts/FeatureFlagsContext"; +import EuiCustomLink from "./EuiCustomLink"; interface FeaturesListProps { + projectName: string; featureViewName: string; features: FeastFeatureColumnType[]; + link: boolean; } -const FeaturesList = ({ featureViewName, features }: 
FeaturesListProps) => { +const FeaturesList = ({ projectName, featureViewName, features, link }: FeaturesListProps) => { const { enabledFeatureStatistics } = useContext(FeatureFlagsContext); const { isLoading, isError, isSuccess, data } = useLoadFeatureViewSummaryStatistics(featureViewName); let columns: { name: string; render?: any; field: any }[] = [ - { name: "Name", field: "name" }, + { + name: "Name", + field: "name", + render: (item: string) => ( + + {item} + + ) + }, { name: "Value Type", field: "valueType", }, ]; + if (!link) { + columns[0].render = undefined; + } + if (enabledFeatureStatistics) { columns.push( ...[ diff --git a/ui/src/components/TagSearch.tsx b/ui/src/components/TagSearch.tsx index e89d4a44cc..e3f7cdd98f 100644 --- a/ui/src/components/TagSearch.tsx +++ b/ui/src/components/TagSearch.tsx @@ -163,7 +163,7 @@ const TagSearch = ({ // HTMLInputElement is hooked into useInputHack inputNode.current = node; }, - onfocus: () => { + onFocus: () => { setHasFocus(true); }, fullWidth: true, diff --git a/ui/src/custom-tabs/TabsRegistryContext.tsx b/ui/src/custom-tabs/TabsRegistryContext.tsx index a5321e9c40..9f493e6d11 100644 --- a/ui/src/custom-tabs/TabsRegistryContext.tsx +++ b/ui/src/custom-tabs/TabsRegistryContext.tsx @@ -11,6 +11,7 @@ import { import RegularFeatureViewCustomTabLoadingWrapper from "../utils/custom-tabs/RegularFeatureViewCustomTabLoadingWrapper"; import OnDemandFeatureViewCustomTabLoadingWrapper from "../utils/custom-tabs/OnDemandFeatureViewCustomTabLoadingWrapper"; import FeatureServiceCustomTabLoadingWrapper from "../utils/custom-tabs/FeatureServiceCustomTabLoadingWrapper"; +import FeatureCustomTabLoadingWrapper from "../utils/custom-tabs/FeatureCustomTabLoadingWrapper"; import DataSourceCustomTabLoadingWrapper from "../utils/custom-tabs/DataSourceCustomTabLoadingWrapper"; import EntityCustomTabLoadingWrapper from "../utils/custom-tabs/EntityCustomTabLoadingWrapper"; import DatasetCustomTabLoadingWrapper from 
"../utils/custom-tabs/DatasetCustomTabLoadingWrapper"; @@ -19,6 +20,7 @@ import { RegularFeatureViewCustomTabRegistrationInterface, OnDemandFeatureViewCustomTabRegistrationInterface, FeatureServiceCustomTabRegistrationInterface, + FeatureCustomTabRegistrationInterface, DataSourceCustomTabRegistrationInterface, EntityCustomTabRegistrationInterface, DatasetCustomTabRegistrationInterface, @@ -29,6 +31,7 @@ interface FeastTabsRegistryInterface { RegularFeatureViewCustomTabs?: RegularFeatureViewCustomTabRegistrationInterface[]; OnDemandFeatureViewCustomTabs?: OnDemandFeatureViewCustomTabRegistrationInterface[]; FeatureServiceCustomTabs?: FeatureServiceCustomTabRegistrationInterface[]; + FeatureCustomTabs?: FeatureCustomTabRegistrationInterface[]; DataSourceCustomTabs?: DataSourceCustomTabRegistrationInterface[]; EntityCustomTabs?: EntityCustomTabRegistrationInterface[]; DatasetCustomTabs?: DatasetCustomTabRegistrationInterface[]; @@ -154,6 +157,15 @@ const useFeatureServiceCustomTabs = (navigate: NavigateFunction) => { ); }; +const useFeatureCustomTabs = (navigate: NavigateFunction) => { + const { FeatureCustomTabs } = React.useContext(TabsRegistryContext); + + return useGenericCustomTabsNavigation( + FeatureCustomTabs || [], + navigate + ); +}; + const useDataSourceCustomTabs = (navigate: NavigateFunction) => { const { DataSourceCustomTabs } = React.useContext(TabsRegistryContext); @@ -211,6 +223,15 @@ const useFeatureServiceCustomTabRoutes = () => { ); }; +const useEntityCustomTabRoutes = () => { + const { EntityCustomTabs } = React.useContext(TabsRegistryContext); + + return genericCustomTabRoutes( + EntityCustomTabs || [], + EntityCustomTabLoadingWrapper + ); +}; + const useDataSourceCustomTabRoutes = () => { const { DataSourceCustomTabs } = React.useContext(TabsRegistryContext); @@ -220,12 +241,12 @@ const useDataSourceCustomTabRoutes = () => { ); }; -const useEntityCustomTabRoutes = () => { - const { EntityCustomTabs } = React.useContext(TabsRegistryContext); 
+const useFeatureCustomTabRoutes = () => { + const { FeatureCustomTabs } = React.useContext(TabsRegistryContext); return genericCustomTabRoutes( - EntityCustomTabs || [], - EntityCustomTabLoadingWrapper + FeatureCustomTabs || [], + FeatureCustomTabLoadingWrapper ); }; @@ -244,6 +265,7 @@ export { useRegularFeatureViewCustomTabs, useOnDemandFeatureViewCustomTabs, useFeatureServiceCustomTabs, + useFeatureCustomTabs, useDataSourceCustomTabs, useEntityCustomTabs, useDatasetCustomTabs, @@ -251,6 +273,7 @@ export { useRegularFeatureViewCustomTabRoutes, useOnDemandFeatureViewCustomTabRoutes, useFeatureServiceCustomTabRoutes, + useFeatureCustomTabRoutes, useDataSourceCustomTabRoutes, useEntityCustomTabRoutes, useDatasetCustomTabRoutes, diff --git a/ui/src/custom-tabs/data-tab/DataQuery.tsx b/ui/src/custom-tabs/data-tab/DataQuery.tsx new file mode 100644 index 0000000000..f101c122e4 --- /dev/null +++ b/ui/src/custom-tabs/data-tab/DataQuery.tsx @@ -0,0 +1,25 @@ +import { useQuery } from "react-query"; + +interface DataQueryInterface { + featureView: string | undefined; +} + +const DataQuery = (featureView: string) => { + const queryKey = `data-tab-namespace:${featureView}`; + + return useQuery( + queryKey, + () => { + // Customizing the URL based on your needs + const url = `/demo-custom-tabs/demo.json`; + + return fetch(url) + .then((res) => res.json()) + }, + { + enabled: !!featureView, // Only start the query when the variable is not undefined + } + ); +}; + +export default DataQuery; diff --git a/ui/src/custom-tabs/data-tab/DataTab.tsx b/ui/src/custom-tabs/data-tab/DataTab.tsx new file mode 100644 index 0000000000..144083420a --- /dev/null +++ b/ui/src/custom-tabs/data-tab/DataTab.tsx @@ -0,0 +1,110 @@ +import React from "react"; +import { z } from "zod"; +import { + EuiCode, + EuiFlexGroup, + EuiHorizontalRule, + EuiLoadingSpinner, + EuiTable, + EuiTitle, + EuiTableHeader, + EuiTableHeaderCell, + EuiPanel, + EuiFlexItem, + EuiTableRow, + EuiTableRowCell, +} from 
"@elastic/eui"; +import useLoadRegularFeatureView from "../../pages/feature-views/useLoadFeatureView"; +import DataQuery from "./DataQuery"; + +const FeatureViewDataRow = z.object({ + name: z.string(), + value: z.string(), +}); + +type FeatureViewDataRowType = z.infer; + +const LineHeightProp: React.CSSProperties = { + lineHeight: 1, +} + +const EuiFeatureViewDataRow = ({name, value}: FeatureViewDataRowType) => { + return ( + + + {name} + + + +
+            {value}
+          
+
+
+
+ ); +} + +const FeatureViewDataTable = (data: any) => { + var items: FeatureViewDataRowType[] = []; + + for (let element in data.data){ + const row: FeatureViewDataRowType = { + name: element, + value: JSON.stringify(data.data[element], null, 2), + }; + items.push(row); + console.log(row); + } + + return ( + + + + Data Item Name + + + Data Item Value + + + {items.map((item) => { + return + })} + + ) +} + +const DataTab = () => { + const fName = "credit_history" + const { isLoading, isError, isSuccess, data } = DataQuery(fName); + const isEmpty = data === undefined; + + return ( + + {isLoading && ( + + Loading + + )} + {isEmpty &&

No feature view with name: {fName}

} + {isError &&

Error loading feature view: {fName}

} + {isSuccess && data && ( + + + + + +

Properties

+
+ + +
+
+
+
+ )} +
+ ); +}; + +export default DataTab; diff --git a/ui/src/custom-tabs/feature-demo-tab/DemoCustomTab.tsx b/ui/src/custom-tabs/feature-demo-tab/DemoCustomTab.tsx new file mode 100644 index 0000000000..fda920daf3 --- /dev/null +++ b/ui/src/custom-tabs/feature-demo-tab/DemoCustomTab.tsx @@ -0,0 +1,83 @@ +import React from "react"; + +import { + // Feature View Custom Tabs will get these props + FeatureCustomTabProps, +} from "../types"; + +import { + EuiLoadingContent, + EuiEmptyPrompt, + EuiFlexGroup, + EuiFlexItem, + EuiCode, + EuiSpacer, +} from "@elastic/eui"; + +// Separating out the query is not required, +// but encouraged for code readability +import useDemoQuery from "./useDemoQuery"; + +const DemoCustomTab = ({ id, feastObjectQuery }: FeatureCustomTabProps) => { + // Use React Query to fetch data + // that is custom to this tab. + // See: https://react-query.tanstack.com/guides/queries + + const { isLoading, isError, isSuccess, data } = useDemoQuery({ + featureView: id, + }); + + if (isLoading) { + // Handle Loading State + // https://elastic.github.io/eui/#/display/loading + return ; + } + + if (isError) { + // Handle Data Fetching Error + // https://elastic.github.io/eui/#/display/empty-prompt + return ( + Unable to load your demo page} + body={ +

+ There was an error loading the Dashboard application. Contact your + administrator for help. +

+ } + /> + ); + } + + // Feast UI uses the Elastic UI component system. + // and are particularly + // useful for layouts. + return ( + + + +

Hello World. The following is fetched data.

+ + {isSuccess && data && ( + +
{JSON.stringify(data, null, 2)}
+
+ )} +
+ +

... and this is data from Feast UI’s own query.

+ + {feastObjectQuery.isSuccess && feastObjectQuery.featureData && ( + +
{JSON.stringify(feastObjectQuery.featureData, null, 2)}
+
+ )} +
+
+
+ ); +}; + +export default DemoCustomTab; diff --git a/ui/src/custom-tabs/feature-demo-tab/useDemoQuery.tsx b/ui/src/custom-tabs/feature-demo-tab/useDemoQuery.tsx new file mode 100644 index 0000000000..b93602dbe3 --- /dev/null +++ b/ui/src/custom-tabs/feature-demo-tab/useDemoQuery.tsx @@ -0,0 +1,44 @@ +import { useQuery } from "react-query"; +import { z } from "zod"; + +// Use Zod to check the shape of the +// json object being loaded +const demoSchema = z.object({ + hello: z.string(), + name: z.string().optional(), +}); + +// Make the type of the object available +type DemoDataType = z.infer; + +interface DemoQueryInterface { + featureView: string | undefined; +} + +const useDemoQuery = ({ featureView }: DemoQueryInterface) => { + // React Query manages caching for you based on query keys + // See: https://react-query.tanstack.com/guides/query-keys + const queryKey = `demo-tab-namespace:${featureView}`; + + // Pass the type to useQuery + // so that components consuming the + // result gets nice type hints + // on the other side. 
+ return useQuery( + queryKey, + () => { + // Customizing the URL based on your needs + const url = `/demo-custom-tabs/demo.json`; + + return fetch(url) + .then((res) => res.json()) + .then((data) => demoSchema.parse(data)); // Use zod to parse results + }, + { + enabled: !!featureView, // Only start the query when the variable is not undefined + } + ); +}; + +export default useDemoQuery; +export type { DemoDataType }; diff --git a/ui/src/custom-tabs/reguar-fv-demo-tab/DemoCustomTab.tsx b/ui/src/custom-tabs/reguar-fv-demo-tab/DemoCustomTab.tsx index 2ce1b4e64b..4f8d7dfcb2 100644 --- a/ui/src/custom-tabs/reguar-fv-demo-tab/DemoCustomTab.tsx +++ b/ui/src/custom-tabs/reguar-fv-demo-tab/DemoCustomTab.tsx @@ -82,4 +82,4 @@ const DemoCustomTab = ({ ); }; -export default DemoCustomTab; +export default DemoCustomTab; \ No newline at end of file diff --git a/ui/src/custom-tabs/reguar-fv-demo-tab/useDemoQuery.tsx b/ui/src/custom-tabs/reguar-fv-demo-tab/useDemoQuery.tsx index b93602dbe3..965d511539 100644 --- a/ui/src/custom-tabs/reguar-fv-demo-tab/useDemoQuery.tsx +++ b/ui/src/custom-tabs/reguar-fv-demo-tab/useDemoQuery.tsx @@ -41,4 +41,4 @@ const useDemoQuery = ({ featureView }: DemoQueryInterface) => { }; export default useDemoQuery; -export type { DemoDataType }; +export type { DemoDataType }; \ No newline at end of file diff --git a/ui/src/custom-tabs/types.ts b/ui/src/custom-tabs/types.ts index f80c56d0e2..1e555d6185 100644 --- a/ui/src/custom-tabs/types.ts +++ b/ui/src/custom-tabs/types.ts @@ -2,6 +2,7 @@ import { useLoadOnDemandFeatureView, useLoadRegularFeatureView, } from "../pages/feature-views/useLoadFeatureView"; +import useLoadFeature from "../pages/features/useLoadFeature"; import useLoadFeatureService from "../pages/feature-services/useLoadFeatureService"; import useLoadDataSource from "../pages/data-sources/useLoadDataSource"; import useLoadEntity from "../pages/entities/useLoadEntity"; @@ -47,7 +48,7 @@ interface 
OnDemandFeatureViewCustomTabRegistrationInterface }: OnDemandFeatureViewCustomTabProps) => JSX.Element; } -// Type for Feature Service Custom Tabs +// Type for Entity Custom Tabs interface EntityCustomTabProps { id: string | undefined; feastObjectQuery: ReturnType; @@ -61,6 +62,21 @@ interface EntityCustomTabRegistrationInterface }: EntityCustomTabProps) => JSX.Element; } +// Type for Feature Custom Tabs +interface FeatureCustomTabProps { + id: string | undefined; + feastObjectQuery: ReturnType; +} +interface FeatureCustomTabRegistrationInterface + extends CustomTabRegistrationInterface { + Component: ({ + id, + feastObjectQuery, + ...args + }: FeatureCustomTabProps) => JSX.Element; +} + + // Type for Feature Service Custom Tabs interface FeatureServiceCustomTabProps { id: string | undefined; @@ -117,6 +133,8 @@ export type { DataSourceCustomTabProps, EntityCustomTabRegistrationInterface, EntityCustomTabProps, + FeatureCustomTabRegistrationInterface, + FeatureCustomTabProps, DatasetCustomTabRegistrationInterface, DatasetCustomTabProps, }; diff --git a/ui/src/graphics/FeatureIcon.tsx b/ui/src/graphics/FeatureIcon.tsx new file mode 100644 index 0000000000..e2e06749bc --- /dev/null +++ b/ui/src/graphics/FeatureIcon.tsx @@ -0,0 +1,52 @@ +import React from "react"; + +const FeatureIcon = ({ + size, + className, +}: { + size: number; + className?: string; +}) => { + return ( + + + + + + + + ); +}; + +const FeatureIcon16 = () => { + return ; +}; + +const FeatureIcon32 = () => { + return ( + + ); +}; + +export { FeatureIcon, FeatureIcon16, FeatureIcon32 }; diff --git a/ui/src/index.tsx b/ui/src/index.tsx index 3a6269a8b7..2233b90c9e 100644 --- a/ui/src/index.tsx +++ b/ui/src/index.tsx @@ -15,12 +15,14 @@ import FeastUI from "./FeastUI"; // 3. Register the tab in the appropriate array below. Each entry // is a record with three keys: label, path, and Component. 
// Import your component and pass it as Component +import DataTab from "./custom-tabs/data-tab/DataTab"; import RFVDemoCustomTab from "./custom-tabs/reguar-fv-demo-tab/DemoCustomTab"; import ODFVDemoCustomTab from "./custom-tabs/ondemand-fv-demo-tab/DemoCustomTab"; import FSDemoCustomTab from "./custom-tabs/feature-service-demo-tab/DemoCustomTab"; import DSDemoCustomTab from "./custom-tabs/data-source-demo-tab/DemoCustomTab"; import EntDemoCustomTab from "./custom-tabs/entity-demo-tab/DemoCustomTab"; import DatasetDemoCustomTab from "./custom-tabs/dataset-demo-tab/DemoCustomTab"; +import FDemoCustomTab from "./custom-tabs/feature-demo-tab/DemoCustomTab"; const queryClient = new QueryClient(); @@ -31,6 +33,11 @@ const tabsRegistry = { path: "demo-tab", // Subpath for the tab Component: RFVDemoCustomTab, }, + { + label: "Data Tab Demo", // Navigation Label for the tab + path: "data-tab", // Subpath for the tab + Component: DataTab, + }, ], OnDemandFeatureViewCustomTabs: [ { @@ -67,6 +74,13 @@ const tabsRegistry = { Component: DatasetDemoCustomTab, }, ], + FeatureCustomTabs: [ + { + label: "Custom Tab Demo", + path: "demo-tab", + Component: FDemoCustomTab, + }, + ], }; ReactDOM.render( diff --git a/ui/src/pages/feature-views/OnDemandFeatureViewOverviewTab.tsx b/ui/src/pages/feature-views/OnDemandFeatureViewOverviewTab.tsx index 1ea509d8df..0922f62102 100644 --- a/ui/src/pages/feature-views/OnDemandFeatureViewOverviewTab.tsx +++ b/ui/src/pages/feature-views/OnDemandFeatureViewOverviewTab.tsx @@ -15,6 +15,7 @@ import { RequestDataSourceType, FeatureViewProjectionType, } from "../../parsers/feastODFVS"; +import { useParams } from "react-router-dom"; import { EntityRelation } from "../../parsers/parseEntityRelationships"; import { FEAST_FCO_TYPES } from "../../parsers/types"; import useLoadRelationshipData from "../../queries/useLoadRelationshipsData"; @@ -39,6 +40,7 @@ const OnDemandFeatureViewOverviewTab = ({ data, }: OnDemandFeatureViewOverviewTabProps) => { const 
inputs = Object.entries(data.spec.sources); + const { projectName } = useParams(); const relationshipQuery = useLoadRelationshipData(); const fsNames = relationshipQuery.data @@ -71,10 +73,12 @@ const OnDemandFeatureViewOverviewTab = ({

Features ({data.spec.features.length})

- {data.spec.features ? ( + {projectName && data.spec.features ? ( ) : ( No Tags sepcified on this feature view. diff --git a/ui/src/pages/feature-views/RegularFeatureViewOverviewTab.tsx b/ui/src/pages/feature-views/RegularFeatureViewOverviewTab.tsx index d284d697e8..689bc6b902 100644 --- a/ui/src/pages/feature-views/RegularFeatureViewOverviewTab.tsx +++ b/ui/src/pages/feature-views/RegularFeatureViewOverviewTab.tsx @@ -69,10 +69,12 @@ const RegularFeatureViewOverviewTab = ({

Features ({data.spec.features.length})

- {data.spec.features ? ( + {projectName && data.spec.features ? ( ) : ( No features specified on this feature view. diff --git a/ui/src/pages/features/FeatureInstance.tsx b/ui/src/pages/features/FeatureInstance.tsx new file mode 100644 index 0000000000..6eb7d0f2d6 --- /dev/null +++ b/ui/src/pages/features/FeatureInstance.tsx @@ -0,0 +1,62 @@ +import React from "react"; +import { Route, Routes, useNavigate, useParams } from "react-router-dom"; +import { + EuiPageHeader, + EuiPageContent, + EuiPageContentBody, +} from "@elastic/eui"; + +import { FeatureIcon32 } from "../../graphics/FeatureIcon"; +import { useMatchExact } from "../../hooks/useMatchSubpath"; +import FeatureOverviewTab from "./FeatureOverviewTab"; +import { useDocumentTitle } from "../../hooks/useDocumentTitle"; +import { + useFeatureCustomTabs, + useFeatureCustomTabRoutes, +} from "../../custom-tabs/TabsRegistryContext"; + +const FeatureInstance = () => { + const navigate = useNavigate(); + let { FeatureViewName, FeatureName } = useParams(); + + const { customNavigationTabs } = useFeatureCustomTabs(navigate); + const CustomTabRoutes = useFeatureCustomTabRoutes(); + + useDocumentTitle(`${FeatureName} | ${FeatureViewName} | Feast`); + + return ( + + { + navigate(""); + }, + }, + ...customNavigationTabs, + ]} + /> + + + + } /> + {CustomTabRoutes} + + + + + ); +}; + +export default FeatureInstance; diff --git a/ui/src/pages/features/FeatureOverviewTab.tsx b/ui/src/pages/features/FeatureOverviewTab.tsx new file mode 100644 index 0000000000..0a1c48509c --- /dev/null +++ b/ui/src/pages/features/FeatureOverviewTab.tsx @@ -0,0 +1,71 @@ +import { + EuiFlexGroup, + EuiHorizontalRule, + EuiLoadingSpinner, + EuiTitle, + EuiPanel, + EuiFlexItem, + EuiDescriptionList, + EuiDescriptionListTitle, + EuiDescriptionListDescription, +} from "@elastic/eui"; +import EuiCustomLink from "../../components/EuiCustomLink"; +import React from "react"; +import { useParams } from "react-router-dom"; +import useLoadFeature from 
"./useLoadFeature"; + +const FeatureOverviewTab = () => { + let { projectName, FeatureViewName, FeatureName } = useParams(); + + const eName = FeatureViewName === undefined ? "" : FeatureViewName; + const fName = FeatureName === undefined ? "" : FeatureName; + const { isLoading, isSuccess, isError, data, featureData } = useLoadFeature(eName, fName); + const isEmpty = data === undefined || featureData === undefined; + + return ( + + {isLoading && ( + + Loading + + )} + {isEmpty &&

No Feature with name {FeatureName} in FeatureView {FeatureViewName}

} + {isError &&

Error loading Feature {FeatureName} in FeatureView {FeatureViewName}

} + {isSuccess && data && ( + + + + + +

Properties

+
+ + + Name + + {featureData?.name} + + + Value Type + + {featureData?.valueType} + + + FeatureView + + + {FeatureViewName} + + + +
+
+
+
+ )} +
+ ); +}; +export default FeatureOverviewTab; diff --git a/ui/src/pages/features/FeatureRawData.tsx b/ui/src/pages/features/FeatureRawData.tsx new file mode 100644 index 0000000000..efbe29d431 --- /dev/null +++ b/ui/src/pages/features/FeatureRawData.tsx @@ -0,0 +1,25 @@ +import React from "react"; +import { EuiPanel } from "@elastic/eui"; +import { useParams } from "react-router-dom"; +import useLoadFeature from "./useLoadFeature"; + +const FeatureRawData = () => { + let { FeatureViewName, FeatureName } = useParams(); + + const eName = FeatureViewName === undefined ? "" : FeatureViewName; + const fName = FeatureName === undefined ? "" : FeatureName; + + const { isSuccess, data } = useLoadFeature(eName, fName); + + return isSuccess && data ? ( + +
{JSON.stringify(data, null, 2)}
+
+ ) : ( + + No data so sad ;-; + + ); +}; + +export default FeatureRawData; diff --git a/ui/src/pages/features/useLoadFeature.ts b/ui/src/pages/features/useLoadFeature.ts new file mode 100644 index 0000000000..5ddaf28204 --- /dev/null +++ b/ui/src/pages/features/useLoadFeature.ts @@ -0,0 +1,29 @@ +import { useContext } from "react"; +import RegistryPathContext from "../../contexts/RegistryPathContext"; +import useLoadRegistry from "../../queries/useLoadRegistry"; + +const useLoadFeature = (featureViewName: string, featureName: string) => { + const registryUrl = useContext(RegistryPathContext); + const registryQuery = useLoadRegistry(registryUrl); + + const data = + registryQuery.data === undefined + ? undefined + : registryQuery.data.objects.featureViews?.find((fv) => { + return fv.spec.name === featureViewName; + }); + + const featureData = + data === undefined + ? undefined + : data?.spec.features.find((f) => { + return f.name === featureName; + }); + + return { + ...registryQuery, + featureData, + }; +}; + +export default useLoadFeature; diff --git a/ui/src/parsers/feastFeatures.ts b/ui/src/parsers/feastFeatures.ts new file mode 100644 index 0000000000..129120c168 --- /dev/null +++ b/ui/src/parsers/feastFeatures.ts @@ -0,0 +1,11 @@ +import { z } from "zod"; +import { FEAST_FEATURE_VALUE_TYPES } from "./types"; +import { jsonSchema } from "./jsonType" + +const FeastFeatureSchema = z.object({ + name: z.string(), + valueType: z.nativeEnum(FEAST_FEATURE_VALUE_TYPES), + metadata: jsonSchema.optional(), +}); + +export { FeastFeatureSchema }; diff --git a/ui/src/parsers/jsonType.ts b/ui/src/parsers/jsonType.ts new file mode 100644 index 0000000000..be484b5477 --- /dev/null +++ b/ui/src/parsers/jsonType.ts @@ -0,0 +1,11 @@ +import { z } from "zod"; + +// Taken from the zod documentation code - accepts any JSON object. 
+const literalSchema = z.union([z.string(), z.number(), z.boolean(), z.null()]); +type Literal = z.infer; +type Json = Literal | { [key: string]: Json } | Json[]; +const jsonSchema: z.ZodType = z.lazy(() => + z.union([literalSchema, z.array(jsonSchema), z.record(jsonSchema)]) +); + +export { jsonSchema }; diff --git a/ui/src/queries/useLoadFeatureViewSummaryStatistics.ts b/ui/src/queries/useLoadFeatureViewSummaryStatistics.ts index 0604029866..fea0bd9d81 100644 --- a/ui/src/queries/useLoadFeatureViewSummaryStatistics.ts +++ b/ui/src/queries/useLoadFeatureViewSummaryStatistics.ts @@ -9,7 +9,7 @@ const useLoadFeatureViewSummaryStatistics = (featureViewName: string) => { const { projectName } = useParams(); const queryKey = `featureViewSummaryStatistics:${featureViewName}`; - const url = `/metadata/${projectName}/featureView/${featureViewName}.json`; + const url = `/data/${projectName}/featureView/${featureViewName}.json`; return useQuery( queryKey, diff --git a/ui/src/utils/custom-tabs/FeatureCustomTabLoadingWrapper.tsx b/ui/src/utils/custom-tabs/FeatureCustomTabLoadingWrapper.tsx new file mode 100644 index 0000000000..7880f82490 --- /dev/null +++ b/ui/src/utils/custom-tabs/FeatureCustomTabLoadingWrapper.tsx @@ -0,0 +1,37 @@ +import React from "react"; +import { useParams } from "react-router-dom"; + +import { FeatureCustomTabProps } from "../../custom-tabs/types"; +import useLoadFeature from "../../pages/features/useLoadFeature"; + +interface FeatureCustomTabLoadingWrapperProps { + Component: (props: FeatureCustomTabProps) => JSX.Element; +} + +const FeatureCustomTabLoadingWrapper = ({ + Component, +}: FeatureCustomTabLoadingWrapperProps) => { + console.log(useParams()); + const { FeatureViewName, FeatureName } = useParams(); + + if (!FeatureViewName) { + throw new Error( + `This route has no 'FeatureViewName' part. 
This route is likely not supposed to render this component.` + ); + } + + if (!FeatureName) { + throw new Error( + `This route has no 'FeatureName' part. This route is likely not supposed to render this component.` + ); + } + + const feastObjectQuery = useLoadFeature(FeatureViewName, FeatureName); + + // do I include FeatureViewName in this? + return ( + + ); +}; + +export default FeatureCustomTabLoadingWrapper; diff --git a/ui/yarn.lock b/ui/yarn.lock index 998565a77a..ad31cbeac5 100644 --- a/ui/yarn.lock +++ b/ui/yarn.lock @@ -1476,15 +1476,37 @@ "@types/yargs" "^16.0.0" chalk "^4.0.0" +"@jridgewell/gen-mapping@^0.3.0": + version "0.3.2" + resolved "https://registry.yarnpkg.com/@jridgewell/gen-mapping/-/gen-mapping-0.3.2.tgz#c1aedc61e853f2bb9f5dfe6d4442d3b565b253b9" + integrity sha512-mh65xKQAzI6iBcFzwv28KVWSmCkdRBWoOh+bYQGW3+6OZvbbN3TqMGo5hqYxQniRcH9F2VZIoJCm4pa3BPDK/A== + dependencies: + "@jridgewell/set-array" "^1.0.1" + "@jridgewell/sourcemap-codec" "^1.4.10" + "@jridgewell/trace-mapping" "^0.3.9" + "@jridgewell/resolve-uri@^3.0.3": - version "3.0.5" - resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.0.5.tgz#68eb521368db76d040a6315cdb24bf2483037b9c" - integrity sha512-VPeQ7+wH0itvQxnG+lIzWgkysKIr3L9sslimFW55rHMdGu/qCQ5z5h9zq4gI8uBtqkpHhsF4Z/OwExufUCThew== + version "3.1.0" + resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz#2203b118c157721addfe69d47b70465463066d78" + integrity sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w== + +"@jridgewell/set-array@^1.0.1": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.1.2.tgz#7c6cf998d6d20b914c0a55a91ae928ff25965e72" + integrity sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw== + +"@jridgewell/source-map@^0.3.2": + version "0.3.2" + resolved 
"https://registry.yarnpkg.com/@jridgewell/source-map/-/source-map-0.3.2.tgz#f45351aaed4527a298512ec72f81040c998580fb" + integrity sha512-m7O9o2uR8k2ObDysZYzdfhb08VuEml5oWGiosa1VdaPZ/A6QyPkAJuwN0Q1lhULOf6B7MtQmHENS743hWtCrgw== + dependencies: + "@jridgewell/gen-mapping" "^0.3.0" + "@jridgewell/trace-mapping" "^0.3.9" "@jridgewell/sourcemap-codec@^1.4.10": - version "1.4.11" - resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.11.tgz#771a1d8d744eeb71b6adb35808e1a6c7b9b8c8ec" - integrity sha512-Fg32GrJo61m+VqYSdRSjRXMjQ06j8YIYfcTqndLYVAaHmroZHLJZCydsWBOTDqXS2v+mjxohBWEMfg97GXmYQg== + version "1.4.14" + resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz#add4c98d341472a289190b424efbdb096991bb24" + integrity sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw== "@jridgewell/trace-mapping@^0.3.0": version "0.3.4" @@ -1494,6 +1516,14 @@ "@jridgewell/resolve-uri" "^3.0.3" "@jridgewell/sourcemap-codec" "^1.4.10" +"@jridgewell/trace-mapping@^0.3.9": + version "0.3.14" + resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.14.tgz#b231a081d8f66796e475ad588a1ef473112701ed" + integrity sha512-bJWEfQ9lPTvm3SneWwRFVLzrh6nhjwqw7TUFFBEMzwvg7t7PCDenf2lDwqo4NQXzdpgBXyFgDWnQA+2vkruksQ== + dependencies: + "@jridgewell/resolve-uri" "^3.0.3" + "@jridgewell/sourcemap-codec" "^1.4.10" + "@mapbox/hast-util-table-cell-style@^0.2.0": version "0.2.0" resolved "https://registry.yarnpkg.com/@mapbox/hast-util-table-cell-style/-/hast-util-table-cell-style-0.2.0.tgz#1003f59d54fae6f638cb5646f52110fb3da95b4d" @@ -2803,10 +2833,10 @@ acorn@^7.0.0, acorn@^7.1.1: resolved "https://registry.yarnpkg.com/acorn/-/acorn-7.4.1.tgz#feaed255973d2e77555b83dbc08851a6c63520fa" integrity sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A== -acorn@^8.2.4, acorn@^8.4.1, acorn@^8.7.0: - version "8.7.0" - resolved 
"https://registry.yarnpkg.com/acorn/-/acorn-8.7.0.tgz#90951fde0f8f09df93549481e5fc141445b791cf" - integrity sha512-V/LGr1APy+PXIwKebEWrkZPwoeoF+w1jiOBUmuxuiUIaOHtob8Qc9BTrYo7VuI5fR8tqsy+buA2WFooR5olqvQ== +acorn@^8.2.4, acorn@^8.4.1, acorn@^8.5.0, acorn@^8.7.0: + version "8.7.1" + resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.7.1.tgz#0197122c843d1bf6d0a5e83220a788f278f63c30" + integrity sha512-Xx54uLJQZ19lKygFXOWsscKUbsBZW0CPykPhVQdhIeIwrbPmJzqeASDInc8nKBnp/JT6igTs82qPXz069H8I/A== address@^1.0.1, address@^1.1.2: version "1.1.2" @@ -7326,9 +7356,9 @@ mkdirp@^0.5.5, mkdirp@~0.5.1: minimist "^1.2.5" moment@^2.29.1: - version "2.29.2" - resolved "https://registry.yarnpkg.com/moment/-/moment-2.29.2.tgz#00910c60b20843bcba52d37d58c628b47b1f20e4" - integrity sha512-UgzG4rvxYpN15jgCmVJwac49h9ly9NurikMWGPdVxm8GZD6XjkKPxDTjQQ43gtGgnV3X0cAyWDdP2Wexoquifg== + version "2.29.4" + resolved "https://registry.yarnpkg.com/moment/-/moment-2.29.4.tgz#3dbe052889fe7c1b2ed966fcb3a77328964ef108" + integrity sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w== ms@2.0.0: version "2.0.0" @@ -9537,7 +9567,7 @@ source-map@^0.5.0, source-map@^0.5.3: resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.5.7.tgz#8a039d2d1021d22d1ea14c80d8ea468ba2ef3fcc" integrity sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w= -source-map@^0.7.3, source-map@~0.7.2: +source-map@^0.7.3: version "0.7.3" resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.7.3.tgz#5302f8169031735226544092e64981f751750383" integrity sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ== @@ -9970,12 +10000,13 @@ terser-webpack-plugin@^5.1.3, terser-webpack-plugin@^5.2.5: terser "^5.7.2" terser@^5.0.0, terser@^5.10.0, terser@^5.7.2: - version "5.10.0" - resolved "https://registry.yarnpkg.com/terser/-/terser-5.10.0.tgz#b86390809c0389105eb0a0b62397563096ddafcc" - integrity 
sha512-AMmF99DMfEDiRJfxfY5jj5wNH/bYO09cniSqhfoyxc8sFoYIgkJy86G04UoZU5VjlpnplVu0K6Tx6E9b5+DlHA== + version "5.14.2" + resolved "https://registry.yarnpkg.com/terser/-/terser-5.14.2.tgz#9ac9f22b06994d736174f4091aa368db896f1c10" + integrity sha512-oL0rGeM/WFQCUd0y2QrWxYnq7tfSuKBiqTjRPWrRgB46WD/kiwHwF8T23z78H6Q6kGCuuHcPB+KULHRdxvVGQA== dependencies: + "@jridgewell/source-map" "^0.3.2" + acorn "^8.5.0" commander "^2.20.0" - source-map "~0.7.2" source-map-support "~0.5.20" test-exclude@^6.0.0: