# .github/workflows/metal.yml
name: Test Metal Backend

on:
  push:
    branches:
      - main
      - release/*
    tags:
      - ciflow/metal/*
  pull_request:
    paths:
      - .github/workflows/metal.yml
      - backends/apple/metal/**
      - backends/aoti/**
      - examples/models/qwen3_5_moe/**
      - extension/llm/export/**
  workflow_dispatch:
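
# The group key combines the workflow name, the PR number (or the commit SHA for
# non-PR events), and booleans distinguishing manual and scheduled runs; e.g. a
# pull_request run for a hypothetical PR #1234 would get the group
# "Test Metal Backend-1234-false-false". cancel-in-progress then cancels any
# older in-flight run in the same group.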
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  test-metal-builds:
    name: test-executorch-metal-build
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux

        echo "::group::Test ExecuTorch Metal build"
        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
        echo "::endgroup::"

  test-metal-modules:
    name: test-metal-backend-modules
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Build Metal Runtime"
        ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
        echo "::endgroup::"

        echo "::group::Run Metal Backend Module Tests"
        ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules
        echo "::endgroup::"

  test-metal-qwen35-moe-tiny:
    name: test-metal-qwen35-moe-tiny
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        # Isolate Inductor cache per job to prevent PCH conflicts
        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")

        echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)"
        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
          --tiny-test \
          --backend metal \
          --qlinear fpa4w \
          --output-dir /tmp/qwen35_moe_metal_tiny
        echo "::endgroup::"

        echo "::group::Build Metal runtime and Qwen 3.5 MoE runner"
        ${CONDA_RUN} cmake --workflow --preset llm-release-metal
        cd examples/models/qwen3_5_moe
        ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal
        cd -
        echo "::endgroup::"

        # Create a byte-level tokenizer for the tiny model (vocab_size=256).
        # Maps each byte value to its own token ID so any prompt produces valid IDs.
        ${CONDA_RUN} python - <<'PY'
        import json
        vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)}
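        # Printable ASCII maps to itself; every other byte gets a <0xNN>
        # placeholder (the same shape as HF tokenizers' byte-fallback tokens).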
        tokenizer = {
            'version': '1.0',
            'model': {'type': 'BPE', 'vocab': vocab, 'merges': []},
            'added_tokens': [{'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': False, 'special': False} for i in range(256)],
        }
        with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f:
            json.dump(tokenizer, f)
        print('Created byte-level tokenizer.json')
        PY

        RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner

        # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh)
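        # Rewriting to @rpath lets dyld resolve libomp through the runner's
        # embedded rpaths instead of a machine-specific absolute path.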
        if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then
          install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER"
        fi

        MODEL=/tmp/qwen35_moe_metal_tiny/model.pte
        TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json

        echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)"
        # Single-char prompt → 1 token → exercises decode-only path
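        # Temporarily drop errexit so we can capture the runner's exit code and
        # still print its output before deciding pass/fail.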
        set +e
        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
          --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Prompt tokens: 1" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; }
        echo "Success: decode completed"
        echo "::endgroup::"

        echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)"
        set +e
        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
          --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        # Byte-level tokenizer: "one two three" = 13 tokens (13 bytes)
        PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*")
        if [ "$PROMPT_TOKENS" -le 2 ]; then
          echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; }
        echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed"
        echo "::endgroup::"

  export-model-metal-artifact:
    name: export-model-metal-artifact
    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    secrets: inherit
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
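      # The artifact name must match the download-artifact key used by
      # test-model-metal-e2e below so each e2e matrix job picks up the
      # corresponding exported model.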
      script: |
        set -eux

        echo "::group::Setup Huggingface"
        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        echo "::endgroup::"

        echo "::group::Setup Optimum-ExecuTorch"
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Pip List"
        ${CONDA_RUN} pip list
        echo "::endgroup::"

        # Isolate Inductor cache and precompiled headers (PCH) per job to prevent
        # PCH mtime conflicts between parallel matrix jobs on the same runner.
        # TORCHINDUCTOR_CACHE_DIR isolates the code cache; setting TMPDIR isolates
        # the PCH dir, which PyTorch derives from tempfile.gettempdir() independently.
        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")

        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

  test-model-metal-e2e:
    name: test-model-metal-e2e
    needs: export-model-metal-artifact
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
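      # Fetches the artifact uploaded by the matching export-model-metal-artifact
      # matrix job above (both names are constructed identically).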
      script: |
        set -eux

        echo "::group::Print machine info"
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sw_vers
          # Print RAM in GB
          RAM_BYTES=$(sysctl -n hw.memsize)
          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
          echo "Available RAM (GB): $RAM_GB"
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
          # Print number of GPU cores (Apple Silicon)
          if command -v system_profiler &> /dev/null; then
            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
            if [ -z "$GPU_CORES" ]; then
              # Fallback: try to parse "Core Count" from the Apple GPU section
              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
            fi
            echo "GPU Cores: ${GPU_CORES:-Unknown}"
          else
            echo "system_profiler not available, cannot determine GPU cores."
          fi
        fi
        echo "::endgroup::"

        ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"