# .github/workflows/cuda-perf.yml
name: cuda-perf

on:
  push:
    branches:
      - main
      - release/*
    tags:
      - ciflow/cuda-perf/*
  pull_request:
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/cuda_perf_prompts/**
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
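
# The trailing event-name boolean keeps manually dispatched runs in their own
# concurrency group, so a manual benchmark neither cancels nor is cancelled by
# the push/PR run for the same commit.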

jobs:
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4'
          ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
        run: |
          set -eux

          MODELS="${{ inputs.models }}"
          QUANTIZATIONS="${{ inputs.quantizations }}"

          # Use all models/quantizations unless overridden by workflow_dispatch
          if [ -z "$MODELS" ]; then
            MODELS="$ALL_MODELS"
          fi
          if [ -z "$QUANTIZATIONS" ]; then
            QUANTIZATIONS="$ALL_QUANTIZATIONS"
          fi

          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

          # Generate benchmark configs (skip invalid model/quant combinations)
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              # Qwen3.5 MoE only supports quantized-int4-tile-packed
              if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then
                continue
              fi
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'

          echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT
          echo "Generated benchmark configs:"
          echo "$CONFIGS" | python -m json.tool

  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
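      # Assumption: the ~35B-parameter Qwen3.5 MoE needs an A100's memory;
      # the remaining models fit on a g5.4xlarge (A10G).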
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
        export USE_MKL=OFF
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Huggingface"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

        # Move artifacts to RUNNER_ARTIFACT_DIR for upload
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"

  benchmark-cuda:
    name: benchmark-cuda
    needs:
      - set-parameters
      - export-models
    if: always()
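    # always(): with fail-fast disabled in both matrices, one failed export
    # should not block benchmarking the entries whose exports succeeded.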
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup environment"
        ./install_requirements.sh
        pip list
        echo "::endgroup::"

        echo "::group::Prepare model artifacts"
        mkdir -p model_artifacts
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

        # Copy optional per-model files (preprocessors, tokenizers, sample inputs) if they exist
        for file in voxtral_preprocessor.pte whisper_preprocessor.pte tekken.json \
                    poem.wav output.wav tokenizer.model test_audio.wav \
                    tokenizer.json tokenizer_config.json special_tokens_map.json; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done

        ls -lah model_artifacts/
        echo "::endgroup::"

        echo "::group::Build runner"
        bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
        echo "::endgroup::"

        echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

        # Get GPU name using nvidia-smi
        GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
        echo "Detected GPU: $GPU_NAME"

        # Get CUDA driver version
        CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
        echo "CUDA Driver Version: $CUDA_DRIVER_VERSION"

        # Create results directory (separate from model artifacts)
        RESULTS_DIR="benchmark_results"
        mkdir -p "$RESULTS_DIR"

        # Determine model name and runner command based on model
        case "${{ matrix.model }}" in
          mistralai/Voxtral-Mini-3B-2507)
            RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
            PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
            TOKENIZER="model_artifacts/tekken.json"
            AUDIO="model_artifacts/poem.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME="voxtral_${{ matrix.quant }}"
            ;;
          openai/whisper-*)
            RUNNER="cmake-out/examples/models/whisper/whisper_runner"
            PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
            AUDIO="model_artifacts/output.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
            ;;
          google/gemma-3-4b-it)
            RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
            IMAGE="docs/source/_static/img/et-logo.png"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
            MODEL_NAME="gemma3_${{ matrix.quant }}"
            ;;
          nvidia/parakeet-tdt)
            RUNNER="cmake-out/examples/models/parakeet/parakeet_runner"
            AUDIO="model_artifacts/test_audio.wav"
            TOKENIZER="model_artifacts/tokenizer.model"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
            MODEL_NAME="parakeet_${{ matrix.quant }}"
            ;;
          SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
            RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
            TOKENIZER="model_artifacts/tokenizer.json"
            # Use a checked-in long prompt (>1000 tokens) for benchmarking. A
            # static, meaningful prompt avoids the degenerate / repetitive
            # outputs that can result from synthetic prompts built by
            # repeating the same sentence.
            PROMPT_FILE=".ci/scripts/cuda_perf_prompts/qwen3_5_moe_long_prompt.txt"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file $PROMPT_FILE --max_new_tokens 512 --temperature 0"
            MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}"
            ;;
          *)
            echo "Error: Unsupported model '${{ matrix.model }}'"
            exit 1
            ;;
        esac
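
        # Illustrative expansion of RUNNER_CMD for openai/whisper-small:
        #   cmake-out/examples/models/whisper/whisper_runner \
        #     --model_path model_artifacts/model.pte \
        #     --data_path model_artifacts/aoti_cuda_blob.ptd \
        #     --tokenizer_path model_artifacts/ \
        #     --audio_path model_artifacts/output.wav \
        #     --processor_path model_artifacts/whisper_preprocessor.pte \
        #     --temperature 0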

        # Run benchmark using cuda_benchmark.py
        python .ci/scripts/cuda_benchmark.py \
          --runner_command "$RUNNER_CMD" \
          --model_name "$MODEL_NAME" \
          --num_runs "${{ matrix.num_runs }}" \
          --output_json "$RESULTS_DIR/benchmark_results.json" \
          --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \
          --model "${{ matrix.model }}" \
          --quantization "${{ matrix.quant }}" \
          --git_sha "${{ github.sha }}" \
          --workflow_run_id "${{ github.run_id }}" \
          --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
          --gpu_name "$GPU_NAME" \
          --cuda_driver_version "$CUDA_DRIVER_VERSION"

        # Save additional metadata
        cat > "$RESULTS_DIR/metadata.json" <<EOF
        {
          "model": "${{ matrix.model }}",
          "quantization": "${{ matrix.quant }}",
          "num_runs": ${{ matrix.num_runs }},
          "runner": "$RUNNER",
          "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "git_sha": "${{ github.sha }}",
          "workflow_run_id": "${{ github.run_id }}",
          "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF
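        # Note: num_runs is interpolated unquoted, so metadata.json records it
        # as a JSON number rather than a string.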

        # Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model)
        # First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR
        rm -rf "${RUNNER_ARTIFACT_DIR}"/*
        # Then copy only the benchmark result JSON files
        cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/"
        echo "Benchmark results prepared for upload:"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"

  upload-benchmark-results:
    needs:
      - benchmark-cuda
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Download all benchmark results
        uses: actions/download-artifact@v4
        with:
          pattern: results-*
          path: all_results/
      - name: Process and display results
        shell: bash
        run: |
          set -eux
          echo "::group::Benchmark Results Summary"
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
              echo ""
              echo "================================"
              echo "Results from: $(basename "$RESULT_DIR")"
              echo "================================"
              # Display benchmark results (mean performance)
              cat "$RESULT_DIR/benchmark_results.json" | python -m json.tool
              # Display metadata
              if [ -f "$RESULT_DIR/metadata.json" ]; then
                echo ""
                echo "--- Metadata ---"
                cat "$RESULT_DIR/metadata.json" | python -m json.tool
              fi
              echo ""
            fi
          done
          echo "::endgroup::"
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Upload to S3
        shell: bash
        env:
          S3_BUCKET: gha-artifacts
          S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
        run: |
          set -eux
          pip install awscli
          echo "Uploading benchmark results to S3..."
          aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
            --exclude "*" \
            --include "*.json" \
            --include "*.log"
          echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"
      - name: Prepare v3 results for dashboard upload
        shell: bash
        run: |
          set -eux
          echo "::group::Prepare v3 results"
          mkdir -p benchmark-results/v3
          # Collect all v3 results into a single directory
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then
              # Generate unique filename based on directory name
              FILENAME=$(basename "$RESULT_DIR")
              cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json"
              echo "✓ Copied $FILENAME v3 results"
            fi
          done
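          # e.g. all_results/results-openai_whisper-small-non-quantized/benchmark_results_v3.json
          #   -> benchmark-results/v3/results-openai_whisper-small-non-quantized.json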
echo "V3 results prepared:"
ls -lah benchmark-results/v3/
echo "::endgroup::"
- name: Upload benchmark results to dashboard
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
with:
benchmark-results-dir: benchmark-results/v3
dry-run: false
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}