diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..00a51aff5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +# +# https://help.github.com/articles/dealing-with-line-endings/ +# +# These are explicitly windows files and should use crlf +*.bat text eol=crlf + diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yaml b/.github/ISSUE_TEMPLATE/bug_report_template.yaml index f6f317370..a23ee6c19 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yaml @@ -7,7 +7,7 @@ body: - type: markdown attributes: value: | - Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/linkedin/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. + Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/feathr-ai/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. #### Please fill in this bug report template to ensure a timely and thorough response. - type: dropdown id: contribution diff --git a/.github/ISSUE_TEMPLATE/doc_improvements.yaml b/.github/ISSUE_TEMPLATE/doc_improvements.yaml index bd8703da4..214b11198 100644 --- a/.github/ISSUE_TEMPLATE/doc_improvements.yaml +++ b/.github/ISSUE_TEMPLATE/doc_improvements.yaml @@ -7,7 +7,7 @@ body: - type: markdown attributes: value: | - Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/linkedin/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. + Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/feathr-ai/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. #### Please fill in this non-technical template to ensure a timely and thorough response. 
- type: dropdown id: contribution diff --git a/.github/ISSUE_TEMPLATE/feature_request_template.yaml b/.github/ISSUE_TEMPLATE/feature_request_template.yaml index ddc3c0405..9e08b470c 100644 --- a/.github/ISSUE_TEMPLATE/feature_request_template.yaml +++ b/.github/ISSUE_TEMPLATE/feature_request_template.yaml @@ -7,7 +7,7 @@ body: - type: markdown attributes: value: | - Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/linkedin/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. + Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/feathr-ai/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. #### Please fill in this bug report template to ensure a timely and thorough response. - type: dropdown id: contribution diff --git a/.github/ISSUE_TEMPLATE/non_technical_request_template.yaml b/.github/ISSUE_TEMPLATE/non_technical_request_template.yaml index c09310514..bd7e90239 100644 --- a/.github/ISSUE_TEMPLATE/non_technical_request_template.yaml +++ b/.github/ISSUE_TEMPLATE/non_technical_request_template.yaml @@ -7,7 +7,7 @@ body: - type: markdown attributes: value: | - Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/linkedin/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. + Thank you for submitting an issue. Please refer to our [contribution guide](https://github.com/feathr-ai/feathr/blob/main/docs/dev_guide/new_contributor_guide.md) for additional information. #### Please fill in this non-technical template to ensure a timely and thorough response. 
- type: dropdown id: contribution diff --git a/.github/workflows/.coveragerc_db b/.github/workflows/.coveragerc_db new file mode 100644 index 000000000..437c076a1 --- /dev/null +++ b/.github/workflows/.coveragerc_db @@ -0,0 +1,9 @@ +[run] +omit = feathr_project/feathr/registry/_feature_registry_purview.py + feathr_project/feathr/registry/registry_utils.py + feathr_project/feathr/spark_provider/_synapse_submission.py + feathr_project/feathr/spark_provider/_localspark_submission.py +[report] +exclude_lines = + pragma: no cover + @abstract \ No newline at end of file diff --git a/.github/workflows/.coveragerc_local b/.github/workflows/.coveragerc_local new file mode 100644 index 000000000..b3c3b213a --- /dev/null +++ b/.github/workflows/.coveragerc_local @@ -0,0 +1,9 @@ +[run] +omit = feathr_project/feathr/registry/_feature_registry_purview.py + feathr_project/feathr/registry/registry_utils.py + feathr_project/feathr/spark_provider/_databricks_submission.py + feathr_project/feathr/spark_provider/_synapse_submission.py +[report] +exclude_lines = + pragma: no cover + @abstract \ No newline at end of file diff --git a/.github/workflows/.coveragerc_sy b/.github/workflows/.coveragerc_sy new file mode 100644 index 000000000..8f971cb21 --- /dev/null +++ b/.github/workflows/.coveragerc_sy @@ -0,0 +1,9 @@ +[run] +omit = feathr_project/feathr/registry/_feature_registry_purview.py + feathr_project/feathr/registry/registry_utils.py + feathr_project/feathr/spark_provider/_databricks_submission.py + feathr_project/feathr/spark_provider/_localspark_submission.py +[report] +exclude_lines = + pragma: no cover + @abstract \ No newline at end of file diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 6e873363f..95c7e3380 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -1,5 +1,5 @@ # This workflow builds the docker container and publishes to dockerhub with appropriate tag -# It has two triggers, 
+# It has two triggers, # 1. daily i.e. runs everyday at specific time. # 2. Anytime a new branch is created under releases @@ -22,19 +22,19 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v3 - + - name: Log in to Docker Hub uses: docker/login-action@v2 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - + - name: Extract metadata (tags, labels) for Docker id: meta uses: docker/metadata-action@v4 with: images: feathrfeaturestore/feathr-registry - + - name: Build and push Docker image uses: docker/build-push-action@v3 with: @@ -44,6 +44,34 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + + build_and_push_feathr_sandbox_image: + name: Push Feathr Sandbox image to Docker Hub + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: feathrfeaturestore/feathr-sandbox + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . 
+ file: FeathrSandbox.Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} # Trigger Azure Web App webhooks to pull the latest nightly image deploy: runs-on: ubuntu-latest @@ -72,4 +100,4 @@ jobs: id: deploy-to-feathr-registry-sql-rbac uses: distributhor/workflow-webhook@v3.0.1 env: - webhook_url: ${{ secrets.AZURE_WEBAPP_FEATHR_REGISTRY_SQL_RBAC_WEBHOOK }} \ No newline at end of file + webhook_url: ${{ secrets.AZURE_WEBAPP_FEATHR_REGISTRY_SQL_RBAC_WEBHOOK }} diff --git a/.github/workflows/publish-to-maven.yml b/.github/workflows/publish-to-maven.yml index ae4d98e68..3dcea6e4a 100644 --- a/.github/workflows/publish-to-maven.yml +++ b/.github/workflows/publish-to-maven.yml @@ -1,18 +1,18 @@ name: Publish package to the Maven Central Repository -on: +on: push: # This pipeline will get triggered everytime there is a new tag created. - # It is required + # It is required tags: ["*"] jobs: publish-to-maven: runs-on: ubuntu-latest - + steps: - name: Checkout source uses: actions/checkout@v2 - + # Setting up JDK 8, this is required to build Feathr - name: Set up JDK 8 uses: actions/setup-java@v2 @@ -27,12 +27,29 @@ jobs: # CI release command defaults to publishSigned # Sonatype release command defaults to sonaTypeBundleRelease - # https://github.com/sbt/sbt-ci-release - - name: Sbt ci release - run: | - sbt ci-release + - name: Gradle publish + if: startsWith(github.head_ref, 'releases/v') + run: gradle clean publish env: PGP_PASSPHRASE: ${{ secrets.MAVEN_GPG_PASSPHRASE }} PGP_SECRET: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }} SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} + + # Publish Released Fat Jar to Blob Storage + - name: Gradle build + run: | + ./gradlew build + # remote folder for CI upload + echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_release" >> $GITHUB_ENV + # get local jar full path (directory included) + echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> 
$GITHUB_ENV + + - name: Azure Blob Storage Upload (Overwrite) + uses: fixpoint/azblob-upload-artifact@v4 + with: + connection-string: ${{secrets.SPARK_JAR_BLOB_CONNECTION_STRING}} + name: ${{ env.CI_SPARK_REMOTE_JAR_FOLDER}} + path: ${{ env.FEATHR_LOCAL_JAR_FULL_NAME_PATH}} + container: ${{secrets.SPARK_JAR_BLOB_CONTAINER}} + cleanup: "true" \ No newline at end of file diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 0eb0e059b..5cb10fa1c 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -22,13 +22,13 @@ on: - "docs/**" - "ui/**" - "**/README.md" - + schedule: # Runs daily at 1 PM UTC (9 PM CST), will send notification to TEAMS_WEBHOOK - cron: '00 13 * * *' jobs: - sbt_test: + gradle_test: runs-on: ubuntu-latest if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test')) steps: @@ -41,7 +41,7 @@ jobs: java-version: "8" distribution: "temurin" - name: Run tests - run: sbt clean && sbt test + run: ./gradlew clean && ./gradlew test python_lint: runs-on: ubuntu-latest @@ -75,15 +75,15 @@ jobs: with: java-version: "8" distribution: "temurin" - - name: Build JAR + - name: Gradle build run: | - sbt assembly + ./gradlew build # remote folder for CI upload echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_github_action_$(date +"%H_%M_%S")" >> $GITHUB_ENV # get local jar name without paths so version change won't affect it - echo "FEATHR_LOCAL_JAR_NAME=$(ls target/scala-2.12/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_NAME=$(ls build/libs/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV # get local jar name without path - echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls target/scala-2.12/*.jar)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> 
$GITHUB_ENV - name: Set up Python 3.8 uses: actions/setup-python@v2 with: @@ -127,9 +127,8 @@ jobs: SQL1_USER: ${{secrets.SQL1_USER}} SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | - # run only test with databricks. run in 4 parallel jobs - pytest -n 6 feathr_project/test/ - + # run only test with databricks. run in 6 parallel jobs + pytest -n 6 --cov-report term-missing --cov=feathr_project/feathr feathr_project/test --cov-config=.github/workflows/.coveragerc_db azure_synapse_test: # might be a bit duplication to setup both the azure_synapse test and databricks test, but for now we will keep those to accelerate the test speed runs-on: ubuntu-latest @@ -143,15 +142,16 @@ jobs: with: java-version: "8" distribution: "temurin" - - name: Build JAR + + - name: Gradle build run: | - sbt assembly + ./gradlew build # remote folder for CI upload echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_github_action_$(date +"%H_%M_%S")" >> $GITHUB_ENV # get local jar name without paths so version change won't affect it - echo "FEATHR_LOCAL_JAR_NAME=$(ls target/scala-2.12/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_NAME=$(ls build/libs/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV # get local jar name without path - echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls target/scala-2.12/*.jar)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> $GITHUB_ENV - name: Set up Python 3.8 uses: actions/setup-python@v2 with: @@ -196,8 +196,8 @@ jobs: SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | # skip databricks related test as we just ran the test; also seperate databricks and synapse test to make sure there's no write conflict - # run in 4 parallel jobs to make the time shorter - pytest -n 6 feathr_project/test/ + # run in 6 parallel jobs to make the time shorter + pytest -n 6 -m "not databricks" --cov-report term-missing --cov=feathr_project/feathr feathr_project/test --cov-config=.github/workflows/.coveragerc_sy local_spark_test: runs-on: 
ubuntu-latest @@ -211,15 +211,16 @@ jobs: with: java-version: "8" distribution: "temurin" - - name: Build JAR + + - name: Gradle build run: | - sbt assembly + ./gradlew build # remote folder for CI upload echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_github_action_$(date +"%H_%M_%S")" >> $GITHUB_ENV # get local jar name without paths so version change won't affect it - echo "FEATHR_LOCAL_JAR_NAME=$(ls target/scala-2.12/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_NAME=$(ls build/libs/*.jar| xargs -n 1 basename)" >> $GITHUB_ENV # get local jar name without path - echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls target/scala-2.12/*.jar)" >> $GITHUB_ENV + echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> $GITHUB_ENV - name: Set up Python 3.8 uses: actions/setup-python@v2 with: @@ -255,11 +256,44 @@ jobs: SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | # skip cloud related tests - pytest feathr_project/test/test_local_spark_e2e.py + pytest --cov-report term-missing --cov=feathr_project/feathr/spark_provider feathr_project/test/test_local_spark_e2e.py --cov-config=.github/workflows/.coveragerc_local + + registry_test: + runs-on: ubuntu-latest + if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'registry test')) + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up JDK 8 + uses: actions/setup-java@v2 + with: + java-version: "8" + distribution: "temurin" + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install Feathr Package + run: | + python -m pip install --upgrade pip + if [ -f ./registry/test/requirements.txt ]; then pip install -r ./registry/test/requirements.txt; fi + if [ -f ./registry/purview-registry/requirements.txt ]; then pip install -r 
./registry/purview-registry/requirements.txt; fi + if [ -f ./registry/sql-registry/requirements.txt ]; then pip install -r ./registry/sql-registry/requirements.txt; fi + - name: Run Registry Test Cases + env: + AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}} + AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}} + AZURE_CLIENT_SECRET: ${{secrets.AZURE_CLIENT_SECRET}} + PURVIEW_NAME: "feathrazuretest3-purview1" + CONNECTION_STR: ${{secrets.CONNECTION_STR}} + run: | + pytest --cov-report term-missing --cov=registry/sql-registry/registry --cov-config=registry/test/.coveragerc registry/test/test_sql_registry.py + pytest --cov-report term-missing --cov=registry/purview-registry/registry --cov-config=registry/test/.coveragerc registry/test/test_purview_registry.py failure_notification: # If any failure, warning message will be sent - needs: [sbt_test, python_lint, databricks_test, azure_synapse_test, local_spark_test] + needs: [gradle_test, python_lint, databricks_test, azure_synapse_test, local_spark_test] runs-on: ubuntu-latest if: failure() && github.event_name == 'schedule' steps: @@ -269,7 +303,7 @@ jobs: notification: # Final Daily Report with all job status - needs: [sbt_test, python_lint, databricks_test, azure_synapse_test, local_spark_test] + needs: [gradle_test, python_lint, databricks_test, azure_synapse_test, local_spark_test] runs-on: ubuntu-latest if: always() && github.event_name == 'schedule' steps: @@ -277,4 +311,4 @@ jobs: run: echo "NOW=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Notification run: | - curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. SBT Test ${{needs.sbt_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. 
Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }} \ No newline at end of file + curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. Gradle Test ${{needs.gradle_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 976c0b239..6c87cf6dc 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ .AppleDouble .LSOverride metastore_db -src/integTest +feathr-impl/src/integTest test-output temp @@ -189,17 +189,16 @@ cython_debug/ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* -target/ .idea .project/target .project/project .DS_store -.DS_Store *.jar -src/main/scala/META-INF/MANIFEST.MF +feathr-impl/src/main/scala/META-INF/MANIFEST.MF *.MF feathr_project/feathr_cli.egg-info/* *.pyc +*.iml # VS Code .vscode @@ -207,9 +206,21 @@ feathr_project/feathr_cli.egg-info/* #Local Build null/* +# Ignore Gradle project-specific cache directory +.gradle + +# Ignore Gradle build output directory +build + # For Metal Server .metals/ .bloop/ project/.bloop metals.sbt +feathr-data-models/src/mainGeneratedDataTemplate/ + .bsp/sbt.json + +# Feathr output debug folder +**/debug/ + diff --git a/.husky/pre-commit b/.husky/pre-commit old mode 100755 new mode 100644 index d24fdfc60..0312b7602 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -1,4 +1,4 @@ #!/usr/bin/env sh . 
"$(dirname -- "$0")/_/husky.sh" -npx lint-staged +npx lint-staged \ No newline at end of file diff --git a/FeathrRegistry.Dockerfile b/FeathrRegistry.Dockerfile index f3c2d6792..c127b81c6 100644 --- a/FeathrRegistry.Dockerfile +++ b/FeathrRegistry.Dockerfile @@ -11,7 +11,7 @@ RUN npm install && npm run build FROM python:3.9 ## Install dependencies -RUN apt-get update -y && apt-get install -y nginx +RUN apt-get update -y && apt-get install -y nginx freetds-dev COPY ./registry /usr/src/registry WORKDIR /usr/src/registry/sql-registry RUN pip install -r requirements.txt diff --git a/FeathrSandbox.Dockerfile b/FeathrSandbox.Dockerfile new file mode 100644 index 000000000..219cf97af --- /dev/null +++ b/FeathrSandbox.Dockerfile @@ -0,0 +1,87 @@ +# TODO: persist the SQLite file in the volumes + +# Stage 1: build frontend ui +FROM node:16-alpine as ui-build +WORKDIR /usr/src/ui +COPY ./ui . + +## Use api endpoint from same host and build production static bundle +RUN echo 'REACT_APP_API_ENDPOINT=http://localhost:8000' >> .env.production +RUN npm install && npm run build + + +FROM jupyter/pyspark-notebook + +USER root + +## Install dependencies +RUN apt-get update -y && apt-get install -y nginx freetds-dev sqlite3 libsqlite3-dev lsb-release redis gnupg redis-server lsof + +# UI Sectioin +## Remove default nginx index page and copy ui static bundle files +RUN rm -rf /usr/share/nginx/html/* +COPY --from=ui-build /usr/src/ui/build /usr/share/nginx/html +COPY ./deploy/nginx.conf /etc/nginx/nginx.conf + + +# Feathr Package Installation Section +# always install feathr from main +WORKDIR /home/jovyan/work +COPY --chown=1000:100 ./feathr_project ./feathr_project +RUN python -m pip install -e ./feathr_project + + +# Registry Section +# install registry +COPY ./registry /usr/src/registry +WORKDIR /usr/src/registry/sql-registry +RUN pip install -r requirements.txt + + + +## Start service and then start nginx +WORKDIR /usr/src/registry +COPY ./feathr-sandbox/start_local.sh 
/usr/src/registry/ + +# install code server +# RUN curl -fsSL https://code-server.dev/install.sh | sh + +# default dir by the jupyter image +WORKDIR /home/jovyan/work +USER jovyan +# copy as the jovyan user +# UID is like this: uid=1000(jovyan) gid=100(users) groups=100(users) +COPY --chown=1000:100 ./docs/samples/local_quickstart_notebook.ipynb . +COPY --chown=1000:100 ./feathr-sandbox/feathr_init_script.py . + +# Run the script so that maven cache can be added for better experience. Otherwise users might have to wait for some time for the maven cache to be ready. +RUN python feathr_init_script.py +RUN python -m pip install interpret + +USER root +WORKDIR /usr/src/registry +RUN ["chmod", "+x", "/usr/src/registry/start_local.sh"] + +# remove ^M chars in Linux to make sure the script can run +RUN sed -i "s/\r//g" /usr/src/registry/start_local.sh + + +# install a Kafka single node instance +# Reference: https://www.looklinux.com/how-to-install-apache-kafka-single-node-on-ubuntu/ +RUN wget https://downloads.apache.org/kafka/3.3.1/kafka_2.12-3.3.1.tgz && tar xzf kafka_2.12-3.3.1.tgz && mv kafka_2.12-3.3.1 /usr/local/kafka && rm kafka_2.12-3.3.1.tgz + +# /usr/local/kafka/bin/zookeeper-server-start.sh /usr/local/kafka/config/zookeeper.properties +# /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties + +WORKDIR /home/jovyan/work + + +# 80: Feathr UI +# 8000: Feathr REST API +# 8888: Jupyter +# 8080: VsCode +# 7080: Interpret +EXPOSE 80 8000 8080 8888 7080 2181 +# run the service so we can initialize +# RUN ["/bin/bash", "/usr/src/registry/start.sh"] +CMD ["/bin/bash", "/usr/src/registry/start_local.sh"] diff --git a/build.gradle b/build.gradle new file mode 100644 index 000000000..82aa48514 --- /dev/null +++ b/build.gradle @@ -0,0 +1,198 @@ +import com.vanniktech.maven.publish.SonatypeHost + +buildscript { + ext.junitJupiterVersion = '5.6.1' + ext.pegasusVersion = '29.22.16' + ext.mavenVersion = '3.6.3' + ext.springVersion = '5.3.19' + 
ext.springBootVersion = '2.5.12' + apply from: './repositories.gradle' + buildscript.repositories.addAll(project.repositories) + dependencies { + classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion + } +} + +plugins { + id 'java' + // Currently "maven-publish" has some issues with publishing to Nexus repo. So, we will use a different plugin. + // See https://issues.sonatype.org/browse/OSSRH-86507 for more details. + id "com.vanniktech.maven.publish" version "0.22.0" + id 'signing' +} + +repositories { + mavenCentral() + mavenLocal() + maven { + url "https://repository.mulesoft.org/nexus/content/repositories/public/" + } + maven { + url "https://linkedin.jfrog.io/artifactory/open-source/" // GMA, pegasus + } + +} + +configurations { + // configuration that holds jars to include in the jar + extraLibs + + // Dependencies that will be provided at runtime in the cloud execution + provided + + compileOnly.extendsFrom(provided) + testImplementation.extendsFrom provided +} + +jar { + archivesBaseName = "feathr_2.12" + duplicatesStrategy = DuplicatesStrategy.EXCLUDE + manifest { + attributes('Class-Path': [project.configurations.runtimeClasspath], + 'Main-Class': 'com.linkedin.feathr.offline.job.FeatureJoinJob', + "Implementation-title": "Build jar for local experimentation") + } + from { + configurations.runtimeClasspath.collect { it.isDirectory() ? it : zipTree(it) } + } + exclude 'META-INF/*.RSA', 'META-INF/*.SF','META-INF/*.DSA' + + // Explicitly exclude com/linkedin/data and org/apache/hadoop files from the final jar. They can cause issues in other downstream applications. 
+ exclude 'com/linkedin/data/**' + exclude 'org/apache/hadoop/**' + zip64 = true +} + +dependencies { + implementation project(":feathr-compute") + implementation project(":feathr-config") + implementation project(":feathr-data-models") + implementation project(":feathr-impl") + // needed to include data models in jar + extraLibs project(path: ':feathr-data-models', configuration: 'dataTemplate') + implementation 'net.snowflake:snowflake-jdbc:3.13.18' + implementation 'net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2' + provided 'com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21' + provided 'com.azure.cosmos.spark:azure-cosmos-spark_3-2_2-12:4.11.1' + provided 'com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8' + provided 'org.eclipse.jetty:jetty-util:9.3.24.v20180605' + provided 'org.apache.kafka:kafka-clients:3.1.0' + provided 'org.apache.spark:spark-core_2.12:3.1.3' + provided 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3' + provided 'org.postgresql:postgresql:42.3.4' +} + +ext { + // Version numbers shared between multiple dependencies + // FUTURE consider version catalogs https://docs.gradle.org/current/userguide/platforms.html + ver = [ + scala : '2.12.15', + scala_rt: '2.12', + spark : '3.1.3' + ] +} + +project.ext.spec = [ + 'product' : [ + 'pegasus' : [ + 'd2' : 'com.linkedin.pegasus:d2:29.33.3', + 'data' : 'com.linkedin.pegasus:data:29.33.3', + 'dataAvro1_6' : 'com.linkedin.pegasus:data-avro-1_6:29.33.3', + 'generator': 'com.linkedin.pegasus:generator:29.33.3', + ], + 'jackson' : [ + 'dataformat_csv' : "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.12.6", + 'dataformat_yaml' : "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.6", + 'dataformat_hocon' : "com.jasonclawson:jackson-dataformat-hocon:1.1.0", + 'module_scala' : "com.fasterxml.jackson.module:jackson-module-scala_$ver.scala_rt:2.12.6", + 'jackson_databind' : "com.fasterxml.jackson.core:jackson-databind:2.12.6.1", + 'jackson_core': 
"com.fasterxml.jackson.core:jackson-core:2.12.6", + 'jackson_module_caseclass' : "com.github.changvvb:jackson-module-caseclass_$ver.scala_rt:1.1.1", + ], + 'spark_redis' : "com.redislabs:spark-redis_$ver.scala_rt:3.0.0", + 'typesafe_config' : "com.typesafe:config:1.3.4", + 'hadoop' : [ + 'mapreduce_client_core' : "org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7", + 'common' : "org.apache.hadoop:hadoop-common:2.7.7", + ], + 'spark' : [ + 'spark_core' : "org.apache.spark:spark-core_$ver.scala_rt:$ver.spark", + 'spark_avro' : "org.apache.spark:spark-avro_$ver.scala_rt:$ver.spark", + 'spark_hive' : "org.apache.spark:spark-hive_$ver.scala_rt:$ver.spark", + 'spark_sql' : "org.apache.spark:spark-sql_$ver.scala_rt:$ver.spark", + 'spark_catalyst' : "org.apache.spark:spark-catalyst_$ver.scala_rt:$ver.spark", + "spark_sql_kafka" : "org.apache.spark:spark-sql-kafka-0-10_$ver.scala_rt:3.1.3" + ], + 'scala' : [ + 'scala_library' : "org.scala-lang:scala-library:$ver.scala", + 'scalatest' : "org.scalatest:scalatest_$ver.scala_rt:3.0.0", + ], + 'avro' : "org.apache.avro:avro:1.10.2", + "avroUtil": "com.linkedin.avroutil1:helper-all:0.2.100", + "azure": "com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21", + 'fastutil' : "it.unimi.dsi:fastutil:8.1.1", + 'mvel' : "org.mvel:mvel2:2.2.8.Final", + 'protobuf' : "com.google.protobuf:protobuf-java:2.6.1", + 'guava' : "com.google.guava:guava:25.0-jre", + 'xbean' : "org.apache.xbean:xbean-asm6-shaded:4.10", + 'log4j' : "log4j:log4j:1.2.17", + 'jetty': "org.eclipse.jetty:jetty-util:9.3.24.v20180605", + 'kafka': "org.apache.kafka:kafka-clients:3.1.0", + + 'json' : "org.json:json:20180130", + 'sqlserver': "com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8", + 'postgresql': "org.postgresql:postgresql:42.3.4", + 'equalsverifier' : "nl.jqno.equalsverifier:equalsverifier:3.1.12", + 'mockito' : "org.mockito:mockito-core:3.1.0", + 'snowflake-jdbc' : "net.snowflake:3.13.18", + "spark-snowflake_2.12" : "net.snowflake:2.10.0-spark_3.2", + 
"mockito_inline": "org.mockito:mockito-inline:2.28.2", + 'testing' : "org.testng:testng:6.14.3", + 'jdiagnostics' : "org.anarres.jdiagnostics:jdiagnostics:1.0.7", + 'jsonSchemaVali': "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1", + "antlr": "org.antlr:antlr4:4.8", + "antlrRuntime": "org.antlr:antlr4-runtime:4.8", + "jsqlparser": "com.github.jsqlparser:jsqlparser:3.1", + + ] +] + +if (hasProperty('buildScan')) { + buildScan { + termsOfServiceUrl = 'https://gradle.com/terms-of-service' + termsOfServiceAgree = 'yes' + } +} + +allprojects { + plugins.withId("com.vanniktech.maven.publish.base") { + group = "com.linkedin.feathr" + version = project.version + mavenPublishing { + publishToMavenCentral(SonatypeHost.DEFAULT) + signAllPublications() + pom { + name = 'Feathr' + description = 'An Enterprise-Grade, High Performance Feature Store' + url = 'https://github.com/linkedin/feathr' + licenses { + license { + name = 'APL2' + url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + } + } + developers { + developer { + id = 'feathr_dev' + name = 'Feathr Dev' + email = 'feathrai@gmail.com' + } + } + scm { + connection = 'scm:git@github.com:linkedin/feathr.git' + url = 'https://github.com/linkedin/feathr' + } + } + } + } +} diff --git a/build.sbt b/build.sbt deleted file mode 100644 index 7289f3139..000000000 --- a/build.sbt +++ /dev/null @@ -1,107 +0,0 @@ -import sbt.Keys.publishLocalConfiguration - -ThisBuild / resolvers += Resolver.mavenLocal -ThisBuild / scalaVersion := "2.12.15" -ThisBuild / version := "0.9.0" -ThisBuild / organization := "com.linkedin.feathr" -ThisBuild / organizationName := "linkedin" -val sparkVersion = "3.1.3" - -publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true) - -val localAndCloudDiffDependencies = Seq( - "org.apache.spark" %% "spark-avro" % sparkVersion, - "org.apache.spark" %% "spark-sql" % sparkVersion, - "org.apache.spark" %% "spark-hive" % sparkVersion, - "org.apache.spark" %% "spark-catalyst" 
% sparkVersion, - "org.apache.logging.log4j" % "log4j-core" % "2.17.2", - "com.typesafe" % "config" % "1.3.4", - "com.fasterxml.jackson.core" % "jackson-databind" % "2.12.6.1", - "org.apache.hadoop" % "hadoop-mapreduce-client-core" % "2.7.7", - "org.apache.hadoop" % "hadoop-common" % "2.7.7", - "org.apache.avro" % "avro" % "1.8.2", - "org.apache.xbean" % "xbean-asm6-shaded" % "4.10", - "org.apache.spark" % "spark-sql-kafka-0-10_2.12" % "3.1.3" -) - -val cloudProvidedDeps = localAndCloudDiffDependencies.map(x => x % "provided") - -val localAndCloudCommonDependencies = Seq( - "com.microsoft.azure" % "azure-eventhubs-spark_2.12" % "2.3.21", - "org.apache.kafka" % "kafka-clients" % "3.1.0", - "com.google.guava" % "guava" % "31.1-jre", - "org.testng" % "testng" % "6.14.3" % Test, - "org.mockito" % "mockito-core" % "3.1.0" % Test, - "nl.jqno.equalsverifier" % "equalsverifier" % "3.1.13" % Test, - "org.scalatest" %% "scalatest" % "3.0.9" % Test, - "it.unimi.dsi" % "fastutil" % "8.1.1", - "org.mvel" % "mvel2" % "2.2.8.Final", - "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.12.6", - "com.fasterxml.jackson.dataformat" % "jackson-dataformat-yaml" % "2.12.6", - "com.fasterxml.jackson.dataformat" % "jackson-dataformat-csv" % "2.12.6", - "com.jasonclawson" % "jackson-dataformat-hocon" % "1.1.0", - "com.redislabs" %% "spark-redis" % "3.0.0", - "org.scalatest" %% "scalatest" % "3.0.9" % "test", - "org.apache.xbean" % "xbean-asm6-shaded" % "4.10", - "com.google.protobuf" % "protobuf-java" % "2.6.1", - "net.snowflake" % "snowflake-jdbc" % "3.13.18", - "net.snowflake" % "spark-snowflake_2.12" % "2.10.0-spark_3.2", - "org.apache.commons" % "commons-lang3" % "3.12.0", - "org.xerial" % "sqlite-jdbc" % "3.36.0.3", - "com.github.changvvb" %% "jackson-module-caseclass" % "1.1.1", - "com.azure.cosmos.spark" % "azure-cosmos-spark_3-1_2-12" % "4.11.1", - "org.eclipse.jetty" % "jetty-util" % "9.3.24.v20180605" -) // Common deps - -val jdbcDrivers = Seq( - 
"com.microsoft.sqlserver" % "mssql-jdbc" % "10.2.0.jre8", - "net.snowflake" % "snowflake-jdbc" % "3.13.18", - "org.postgresql" % "postgresql" % "42.3.4", -) - -// For azure -lazy val root = (project in file(".")) - .settings( - name := "feathr", - // To assemble, run sbt assembly -java-home /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home - assembly / mainClass := Some("com.linkedin.feathr.offline.job.FeatureJoinJob"), - libraryDependencies ++= cloudProvidedDeps, - libraryDependencies ++= localAndCloudCommonDependencies, - libraryDependencies ++= jdbcDrivers, - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % sparkVersion % "provided" - ) - ) - -// If you want to build jar for feathr test, enable this and comment out root -//lazy val localCliJar = (project in file(".")) -// .settings( -// name := "feathr-cli", -// // To assemble, run sbt assembly -java-home /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home -// assembly / mainClass := Some("com.linkedin.feathr.cli.FeatureExperimentEntryPoint"), -// // assembly / mainClass := Some("com.linkedin.feathr.offline.job.FeatureJoinJob"), -// libraryDependencies ++= localAndCloudDiffDependencies, -// libraryDependencies ++= localAndCloudCommonDependencies, -// libraryDependencies ++= Seq( -// // See https://stackoverflow.com/questions/55923943/how-to-fix-unsupported-class-file-major-version-55-while-executing-org-apache -// "org.apache.spark" %% "spark-core" % sparkVersion exclude("org.apache.xbean","xbean-asm6-shaded") -// ) -// ) - - -// To assembly with certain java version: sbt assembly -java-home "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home" -// Please specify the feathr version feathr-assembly-X.X.X-SNAPSHOT.jar -// To execute the jar: java -jar target/scala-2.12/feathr-assembly-0.5.0-SNAPSHOT.jar (Please use the latest version of the jar) - -assembly / assemblyMergeStrategy := { - // See 
https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file - // See https://stackoverflow.com/questions/62232209/classnotfoundexception-caused-by-java-lang-classnotfoundexception-csv-default - case PathList("META-INF","services",xs @ _*) => MergeStrategy.filterDistinctLines - case PathList("META-INF",xs @ _*) => MergeStrategy.discard - case _ => MergeStrategy.first -} - -// Some systems(like Hadoop) use different versinos of protobuf(like v2) so we have to shade it. -assemblyShadeRules in assembly := Seq( - ShadeRule.rename("com.google.protobuf.**" -> "shade.protobuf.@1").inAll, -) \ No newline at end of file diff --git a/deploy/start.sh b/deploy/start.sh index e8bd1eea5..e46115238 100755 --- a/deploy/start.sh +++ b/deploy/start.sh @@ -33,19 +33,8 @@ nginx # Start API app LISTENING_PORT="8000" -if [ "x$REACT_APP_ENABLE_RBAC" == "x" ]; then - echo "RBAC flag not configured, only launch registry app" - if [ "x$PURVIEW_NAME" == "x" ]; then - echo "Purview flag is not configured, run SQL registry" - cd sql-registry - uvicorn main:app --host 0.0.0.0 --port $LISTENING_PORT - else - echo "Purview flag is configured, run Purview registry" - cd purview-registry - uvicorn main:app --host 0.0.0.0 --port $LISTENING_PORT - fi -else - echo "RBAC flag configured, launch both rbac and reigstry apps" +if [ "$REACT_APP_ENABLE_RBAC" == "true" ]; then + echo "RBAC flag configured and set to true, launch both rbac and reigstry apps" if [ "x$PURVIEW_NAME" == "x" ]; then echo "Purview flag is not configured, run SQL registry" cd sql-registry @@ -65,4 +54,15 @@ else export RBAC_API_AUDIENCE="${REACT_APP_AZURE_CLIENT_ID}" export RBAC_CONNECTION_STR="${CONNECTION_STR}" uvicorn main:app --host 0.0.0.0 --port $LISTENING_PORT +else + echo "RBAC flag not configured or not equal to true, only launch registry app" + if [ "x$PURVIEW_NAME" == "x" ]; then + echo "Purview flag is not configured, run SQL registry" + cd sql-registry + uvicorn main:app --host 0.0.0.0 --port 
$LISTENING_PORT + else + echo "Purview flag is configured, run Purview registry" + cd purview-registry + uvicorn main:app --host 0.0.0.0 --port $LISTENING_PORT + fi fi diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 2735306c5..000000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -FROM python:3.9 - -ARG CODE_SERVER_VERSION=2.1692-vsc1.39.2 -ENV CODE_SERVER_AUTH=none \ - CODE_SERVER_PASSWORD=feathr \ - JUPYTER_TOKEN=feathr - -# install enssential libs -RUN apt-get update && \ - apt-get install -y \ - wget curl git cmake - - -# install useful tools: sshd, jupyterlab and so on... -RUN apt-get update && \ - apt-get install -y supervisor vim sudo zip unzip htop && \ - pip install --no-cache-dir jupyterlab==0.35.4 pandavro - -RUN mkdir /feathr -WORKDIR /feathr -RUN pip install git+https://github.com/linkedin/feathr.git#subdirectory=feathr_project - -# install code-server -RUN mkdir -p /opt/code-server && \ - wget -qO- https://github.com/cdr/code-server/releases/download/${CODE_SERVER_VERSION}/code-server${CODE_SERVER_VERSION}-linux-x86_64.tar.gz | tar -xzf - --strip=1 -C /opt/code-server - - -# copy supervisord configuration and mkdir -COPY supervisord.conf /opt/supervisord.conf -RUN mkdir -p /opt/logs - -# 8080: VsCode, 9090: jupyter -EXPOSE 8080 9090 - -CMD ["supervisord", "-c", "/opt/supervisord.conf"] \ No newline at end of file diff --git a/docker/supervisord.conf b/docker/supervisord.conf deleted file mode 100644 index 8550a8ca1..000000000 --- a/docker/supervisord.conf +++ /dev/null @@ -1,39 +0,0 @@ -[unix_http_server] -file=/var/run/supervisor.sock ; (the path to the socket file) -chmod=0700 ; sockef file mode (default 0700) - -[supervisorctl] -serverurl=unix:///var/run/supervisor.sock ; use a unix:// URL for a unix socket - -[supervisord] -pidfile=/var/run/supervisord.log -logfile=/opt/logs/supervisord.log -logfile_maxbytes=256MB -logfile_backups=8 -loglevel=info -nodaemon=true - -[program:jupyter] -directory=/ 
-command=jupyter lab --ip 0.0.0.0 --port 9090 --allow-root --LabApp.token=%(ENV_JUPYTER_TOKEN)s -stdout_logfile=/opt/logs/jupyter.log -autostart=true -autorestart=true -startsecs=1 -startretries=3 -stopasgroup=true -killasgroup=true -priority=1001 - -[program:codeserver] -directory=/ -environment=PASSWORD="%(ENV_CODE_SERVER_PASSWORD)s" -command=/opt/code-server/code-server --host 0.0.0.0 --port 8080 --auth %(ENV_CODE_SERVER_AUTH)s -stdout_logfile=/opt/logs/code-server.log -autostart=true -autorestart=true -startsecs=1 -startretries=3 -stopasgroup=true -killasgroup=true -priority=1002 \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index ca67ed446..acfae7e62 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@