diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml deleted file mode 100644 index 7eaf017fbc..0000000000 --- a/.github/workflows/build-and-release.yaml +++ /dev/null @@ -1,145 +0,0 @@ -name: Build Release - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-22.04, windows-2022, macos-14, macos-15] - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - # Used to host cibuildwheel - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - shell: cmd - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - # disable repair - CIBW_REPAIR_WHEEL_COMMAND: "" - with: - package-dir: . 
- output-dir: wheelhouse - - - uses: actions/upload-artifact@v4 - with: - name: wheels-${{ matrix.os }} - path: ./wheelhouse/*.whl - - build_wheels_arm64: - name: Build arm64 wheels - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - CIBW_SKIP: "*musllinux* pp*" - CIBW_REPAIR_WHEEL_COMMAND: "" - CIBW_ARCHS: "aarch64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" - with: - output-dir: wheelhouse - - - name: Upload wheels as artifacts - uses: actions/upload-artifact@v4 - with: - name: wheels_arm64 - path: ./wheelhouse/*.whl - - build_sdist: - name: Build source distribution - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: cmd - - - name: Build source distribution - run: | - python -m build --sdist - - - uses: actions/upload-artifact@v4 - with: - name: sdist - path: ./dist/*.tar.gz - - release: - name: Release - needs: [build_wheels, build_wheels_arm64, build_sdist] - runs-on: ubuntu-latest - - steps: - - uses: actions/download-artifact@v4 - with: - merge-multiple: true - path: dist - - - uses: 
softprops/action-gh-release@v2 - with: - files: dist/* - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml deleted file mode 100644 index b290f6273f..0000000000 --- a/.github/workflows/build-docker.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Build Docker - -on: workflow_dispatch - -permissions: - contents: write - packages: write - -jobs: - docker: - name: Build and push Docker image - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - id: docker_build - uses: docker/build-push-action@v6 - with: - context: . 
- file: "docker/simple/Dockerfile" - push: ${{ startsWith(github.ref, 'refs/tags/') }} - pull: true - platforms: linux/amd64,linux/arm64 - tags: | - ghcr.io/abetlen/llama-cpp-python:latest - ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} - build-args: | - BUILDKIT_INLINE_CACHE=1 - - - name: Publish to GitHub Tag - if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/') - run: | - echo "Docker image published for tag: ${{ github.ref_name }}" diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 889a1679a4..d7a3a90d81 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU124) for Linux # Workflow name +name: Build Wheels (CU124) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ubuntu-22.04 container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.4.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + 
apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # 
CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. 
+ "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ 
env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index 01bd48e7de..e856533410 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.4.1"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on 
-DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain (resolved $toolchainFile, absolute path) / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 568824c642..9f28a57ca2 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU126) for Linux # Workflow name +name: Build Wheels (CU126) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ubuntu-22.04 container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.6.3"] - releasetag: 
["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - 
CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. 
+ "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index 9330cb130b..b77b17917f 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.6.3"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on 
-DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index d1c387c52a..c6b255c9f9 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU128) for Linux # Workflow name +name: Build Wheels (CU128) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ubuntu-22.04 container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.8.1"] - releasetag: 
["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on 
-DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. 
+ CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 98ebbc4127..223473dde6 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.8.1"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on 
-DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split file name: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml deleted file mode 100644 index 4f4305ad3e..0000000000 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Build Wheels(CU130) for Linux - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions - cuda: ["13.0.2"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash 
- - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v7 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - 
if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info for rename wheel file and release tag --- - - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - - wheel_path=$(ls dist/*.whl | head -n 1) - filename=$(basename "$wheel_path") - - # Split wheel filename - IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - - # Rename wheel file - mv "$wheel_path" "dist/$new_filename" - echo "Renamed wheel to: $new_filename" - - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION 
}}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index d6187d7bf4..790d7c9665 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -8,85 +8,199 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["13.0.2"] - releasetag: ["Basic"] cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive + + - name: Inspect Visual Studio OpenMP runtime paths + run: | + Write-Output "ProgramFiles=$env:ProgramFiles" + Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}" + Write-Output "" + + $vsRoots = @( + "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC" + ) + + foreach ($root in $vsRoots) { + Write-Output "Checking root: $root" + + if (Test-Path $root) { + Write-Output " Exists: yes" + Write-Output " MSVC version directories:" + + Get-ChildItem $root -Directory -ErrorAction SilentlyContinue | + Sort-Object Name | + ForEach-Object { + Write-Output " $($_.FullName)" + } + + Write-Output " OpenMP runtime candidates:" + + Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue | + Sort-Object FullName | + ForEach-Object { + $sizeKB = [Math]::Round($_.Length / 1KB, 2) + $sizeMB = [Math]::Round($_.Length / 1MB, 4) + + Write-Output " Path: $($_.FullName)" + Write-Output " Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB" + } + } else { + Write-Output " Exists: no" + } + + Write-Output "" + } + + Write-Output "Checking System32 fallback:" + $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll" + + if (Test-Path $system32OpenMP) { + $dll = Get-Item $system32OpenMP + $sizeKB = [Math]::Round($dll.Length / 1KB, 2) + $sizeMB = [Math]::Round($dll.Length / 1MB, 4) + + Write-Output " Path: $($dll.FullName)" + Write-Output " Size: $($dll.Length) bytes / 
$sizeKB KB / $sizeMB MB" + } else { + Write-Output " Not found: $system32OpenMP" + } - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - 
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +211,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split file name: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +220,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu131-linux.yml b/.github/workflows/build-wheels-cu131-linux.yml new file mode 100644 index 0000000000..d70f8a01c8 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-linux.yml @@ -0,0 +1,156 @@ +name: Build Wheels (CU131) for Linux + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.1.2-cudnn-devel-ubuntu22.04 + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions + cuda: ["13.1.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real;121-real"] + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Install dependencies + run: | + apt update + apt install -y \ + 
build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Show CUDA version + run: nvcc -V + + - name: Build wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu" + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" + run: | + set -euo pipefail + + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true + + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior.
+ "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi + + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" + + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" + + - name: Get current date + id: get-date + run: | + currentDate=$(date +%Y%m%d) + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml new file mode 100644 index 0000000000..14bea65d19 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -0,0 +1,191 @@ +name: Build Wheels (CU131) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.1.1"] + 
cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. 
+ $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. + + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain (path resolved and validated above) / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel
built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + # CPU all-variants is now an internal runtime layout detail. + $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 40675b4c26..2b00d1abaa 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -8,8 +8,8 @@ permissions: jobs: build_wheels: - name: Build wheels (Metal macos) - runs-on: macos-latest + name: Build wheels (Metal macos-26) + runs-on: macos-26 outputs: version: ${{steps.get_version.outputs.version}} @@ -53,8 +53,7 @@ jobs: -DCMAKE_CROSSCOMPILING=on -DGGML_METAL=on -DGGML_METAL_USE_BF16=on - -DGGML_METAL_EMBED_LIBRARY=off - -DGGML_METAL_SHADER_DEBUG=on" + -DGGML_METAL_EMBED_LIBRARY=on" with: package-dir: . 
output-dir: wheelhouse2 @@ -75,7 +74,7 @@ jobs: uses: actions/checkout@v6 - name: Download artifacts - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: merge-multiple: true path: dist2 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 335b0f0ac3..ec81b294c4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,35 +24,27 @@ jobs: # Don't cancel other jobs in the matrix if one fails fail-fast: false matrix: - os: [ubuntu-latest, windows-latest] - python-version: ["3.9", "3.13", "3.14"] + os: [ubuntu-latest, windows-2022] + python-version: ["3.9", "3.14"] include: # macOS Non-Metal - - os: macos-14 + - os: macos-15-intel python-version: "3.9" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=off" - metal_status: "(No Metal)" - - os: macos-14 + - os: macos-15-intel python-version: "3.14" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" # macOS Metal - - os: macos-14 + - os: macos-26 python-version: "3.9" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" - metal_status: "(Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on" metal_status: "(Metal)" - - os: macos-14 + - os: macos-26 python-version: "3.14" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on" metal_status: "(Metal)" steps: diff --git a/.gitmodules b/.gitmodules index 7edf0975dc..f56cca32df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = 
https://github.com/ggerganov/llama.cpp.git + url = https://github.com/ggml-org/llama.cpp.git diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index ff3e950cd1..0000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Read the Docs configuration file for MkDocs projects -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.11" - -mkdocs: - configuration: mkdocs.yml - -python: - install: - - method: pip - path: . - - requirements: docs/requirements.txt - -submodules: - include: all - recursive: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 253b2ae4cc..1865195db3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,424 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.40-Milestone] Reasoning Budget Control, Gemma 4 12B Support, Enhanced Jinja2ChatFormatter, NGram k/k4v Speculative Decoding, Faster Native Sampling and Multimodal Improvements + +- feat(internals): Add `ReasoningBudgetSampler` support + - Add Python-backed `ReasoningBudgetSampler` for first reasoning-block control + - Install the sampler before probability filters to preserve forced end tokens + - Support `reasoning_budget` **-1/0/N** semantics in sampling params + - Force `reasoning_budget_message` + `reasoning_end` when the budget is exhausted + - Add manual `force_reasoning_budget()` at the sampling-context level + - Match llama.cpp force behavior by allowing only `COUNTING -> FORCING` + - Keep DONE as permanent passthrough and ignore later reasoning tags + - Support prefilled reasoning starts with `reasoning_start_in_prompt` + - Preserve UTF-8 boundary safety before forcing the end sequence + - Keep Python-backed custom sampler callbacks alive across C sampler usage + - Avoid shallow-copying custom_samplers 
when cloning sampler chains + - Add `verbose` parameter to `ReasoningBudgetSampler` to print high-level + state transitions to stderr. + - Log key events: initialization, `reasoning_start matched`, `budget exhausted`, + `forced end sequence`, `UTF-8 boundary waiting`, `manual force`, `natural end`, `reset`. + - Pass `verbose=getattr(model, "verbose", False)` from `LlamaSamplingContext` + when building the sampler chain. + - Preserve verbose flag when cloning the sampler. + +- feat(Llama): pass `reasoning budget` params through Llama APIs + - Add `reasoning budget` params to public completion and chat entry points + - Forward the params from chat handlers into `create_completion` + - Propagate reasoning budget controls down to `generate` and `sampling params` + - Document -1/0/N reasoning_budget behavior in completion docstrings + - Support custom `reasoning_start` and `reasoning_end` tags without model-specific inference + - Support `reasoning_budget_message` and `reasoning_start_in_prompt` + - Wire `MTMD chat handler` to the same reasoning budget controls + +- feat(sampling): add reasoning budget configurations + * Introduce reasoning budget and block control parameters to `LlamaSamplingParams` + to mirror llama.cpp CLI semantics. This includes: + - `reasoning_budget` + - `reasoning_start` / `reasoning_end` + - `reasoning_budget_message` + - `reasoning_start_in_prompt` + - `reasoning_start_max_tokens` + - Fix typo from typ_p to typical_p in logs + - Also updated `print_params()` to include these new metrics. + +- feat: add `ReasoningBudgetState` enum and `TokenMatcher` helper class to _internals.py + * Introduce `ReasoningBudgetState` enum and `TokenMatcher` helper class + to `_internals.py`. This lays the groundwork for the upcoming + `ReasoningBudgetSampler`, mirroring the state machine defined in + `common/reasoning-budget.h`. + + - `ReasoningBudgetState`: Tracks the lifecycle of the first reasoning block. 
+ - `TokenMatcher`: Handles incremental matching for multi-token sequences. + +- docs(README): document reasoning budget sampler usage + - Add README section for first reasoning-block budget control + - Document reasoning_budget -1/0/N semantics and related sampler parameters + - Explain reasoning_budget_message injection before reasoning_end + - Add examples for default tags, Mistral [THINK] tags, and Gemma4 channel tags + - Clarify when to use reasoning_start_in_prompt for prefilled thinking tags + - Note that reasoning_start_in_prompt is not a generic thinking-enabled switch + - Mention verbose transition logs for reasoning-budget state changes + - docs(README): Update ReasoningBudgetSampler quick link + +- feat(chat-format): Update `google/gemma-4` chat template jinja + +- feat(llama): enhance chat template initialization with full special tokens + * Update Llama.__init__ to register additional tokenizer special tokens + and improve stop token handling for chat templates. + + - Expose extra special tokens (EOT, SEP, NL, PAD, MASK) via + `special_tokens_map` to Jinja2ChatFormatter. + - Keep BOS and EOS tokens as explicit parameters, no longer redundantly + put them in `special_tokens_map`. + - Build `stop_token_ids` once, including EOS and EOT tokens, skipping + invalid (-1) ids. + - Update try-block comment: now `{% generation %}` blocks are supported, + guard only against malformed or model-specific templates. + - This ensures better compatibility with HuggingFace-style chat templates + while maintaining llama-cpp-python prompt-rendering behavior. + +- **feat(chat-format): improve Jinja2ChatFormatter HF compatibility** + * Enhance Jinja2ChatFormatter to better support HuggingFace-style chat + templates while keeping the formatter lightweight and aligned with + llama-cpp-python's prompt-rendering needs. + + - Key changes: + - Add IgnoreGenerationTags Jinja extension for HF `{% generation %}` blocks. 
+ - Enable Jinja loop controls for chat templates using break/continue. + - Register Transformers-compatible `tojson` behavior. + - Register `raise_exception` and `strftime_now` as Jinja globals. + - Add `special_tokens_map` support for additional template variables. + - Add optional `documents` argument for document-aware templates. + - Precompute text stop sequences and token-id stopping criteria. + - Improve type normalization for `stop_token_ids`. + - Expand docstrings for formatter initialization and render-time variables. + +- docs(wiki): update SCHEMA.md to v0.4 with full wiki path layout + - Added comprehensive docs/wiki/ directory structure overview. + - Reorganized modules description; removed hardcoded module page list. + - Clarified top-level file purposes and update guidance. + - Updated page type examples and templates (Class/Module, Feature, Example, Development). + - Strengthened cross-linking rules and update/placeholder guidance. + - Bumped schema version from 0.3 → 0.4 and last_modified date. + +- docs(install): add source-aligned build and backend guide + * Document installation workflows for llama-cpp-python with a focus on + the underlying llama.cpp CMake build configuration. + - Add virtual environment, source install, editable install, rebuild, and + verification guidance. + - Document common CMake options such as GGML_NATIVE, + GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS, and compiler selection. + - Summarize backend-specific build flags for CUDA, BLAS, Metal, Vulkan, + OpenVINO, HIP, SYCL, OpenCL, CANN, ZenDNN, and zDNN. + - Include backend runtime notes and common installation pitfalls while + keeping server-related installation content out of the page. + - docs(wiki): link installation guide from index + * Promote the completed installation guide into the wiki entry point so + new users can find build and backend setup instructions before reading + API-specific documentation. + - Add a Getting Started section that links to install.md. 
+ - Move installation to the top of the recommended reading order. + - Mark install.md as an available page. + - Remove installation from the planned documentation areas. + - docs(readme): link detailed installation wiki guide + +- feat(mtmd): improve fallback chat template for multimodal models + - Add BOS/EOS token handling to the default MTMD chat format. + - Use a clearer role-based template with explicit USER and ASSISTANT prefixes. + - Append a newline after each message to keep generated prompts readable. + - Treat EOS as the end marker for the serialized conversation history before + the optional generation prompt. + - Improve fallback behavior for multimodal GGUF models that do not provide a + chat template, such as OCR-oriented models like `DeepSeek-OCR 1/2`. + - Make the default system prompt a single normalized string while preserving + its original meaning. + - Clean up minor formatting around MTMD context parameter initialization. + - docs(Readme): Update `Deepseek-OCR-2-GGUF` Link + - docs(README): update `MinerU2.5-Pro-2605-1.2B` OCR model support and link + + This improves prompt compatibility for multimodal models that either lack a + GGUF chat template or are not yet covered by a complete custom chat handler. + +- refactor(internals): align model metadata wrappers with llama.cpp API + - Use `llama_vocab_n_tokens()` instead of the old vocab size helper. + - Add Python wrappers for model description, size, chat template, and + trained RoPE frequency scaling. + - Clarify model capability helpers with docstrings matching llama.cpp + semantics. + - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to + make their scope explicit. + - Drop the unused `get_tensor()` stub since llama.cpp does not expose it. + - Route rerank template lookup through `LlamaModel.model_chat_template()` for + consistency with the internal model abstraction. 
+ +- feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR + - Update PaddleOCRChatHandler to support version 1.6 + - Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL + - Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers + +- **perf(eval): skip unnecessary logit array copies during native sampling** + * Introduce the `copy_logits` parameter to `Llama.eval()` to control + whether C-level logits are copied into the Python `self.scores` array. + - Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) or + `logits_all` explicitly require them. + - Skip logit copies entirely for intermediate prompt evaluations (e.g., + before hybrid checkpoints). + - Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + + In a PDF-reading summarization workload, this reduced the end-to-end completion + time from 41.32s to 25.93s, a ~37.2% improvement. The main generation hot path + also improved noticeably: + + - `_create_completion`: 41.32s -> 25.93s + - `generate`: 37.82s -> below the top sampled entries + - `eval`: 35.14s -> 21.96s + - logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()` + - `decode`: 3.89s -> 2.25s + - `detokenize`: 2.60s -> 1.33s + - `sample`: 2.35s -> 2.03s + + This significantly reduces CPU overhead and memory bandwidth during generation, + as the native `llama.cpp` sampler reads directly from the C context without + needing to expose the `n_vocab` array to Python on every token. 
+ +- docs(CUDA): Add note about PDL optimization for newer NVIDIA GPUs (CC ≥ 90) + +- docs(readme/wiki): update supported embeddings models table + - Add `jina-embeddings-v2-base-zh` + - Add `jina-embeddings-v3` + - Minor table formatting cleanup + +- docs(development): add AI agent prompt for git commit generation + * Introduce `git-commit-generation-agent.md` to the development wiki to + standardize the creation of high-quality git commit messages using LLM + assistants. + + - Define the system persona, core principles (Conventional Commits, DCO), + and strict formatting rules for generating commits. + - Provide concrete template examples for build, performance, and + documentation updates. + - Ensure future maintainers and contributors can easily generate + consistent, maintainer-level commits that explicitly explain the "Why" + and "How" of code changes. + +- docs(wiki): add development helper to index + * Introduce the development section in the wiki index so maintainer-facing + workflows and LLM-assisted helper tools are discoverable from the main + navigation. + + - Add a Development section with a link to the Git commit generation agent. + - Include the helper in the recommended reading order for new wiki users. + - Add development/git-commit-generation-agent.md to the available pages list. + +- feat(LlamaContext): add safety checks and docstrings to logits retrieval + - Add explicit null pointer validation to `get_logits` and `get_logits_ith`. + These methods now raise a `RuntimeError` instead of silently returning + invalid pointers when logits are unavailable or the index is out of bounds. + - Add comprehensive docstrings to both methods, detailing the underlying + buffer shape and memory layout. + - Include a performance warning in `get_logits_ith` about the internal + synchronization/reordering overhead to discourage its use on the hot path.
+ +- **feat(speculative): upgrade ngram map decoder with k/k4v modes +Enhance `LlamaNGramMapDecoding` to align with the upstream llama.cpp +ngram-map algorithm, offering better memory management and draft quality.** + - Introduce `mode` selection ("k" and "k4v"): "k" stores only historical + positions for memory efficiency, while "k4v" caches continuation values + directly for faster lookups. + - Add `min_hits` threshold to filter out low-confidence drafts. + - Implement `max_entries_per_key` to cap dictionary growth and prevent + memory bloat during long-context generations. + - Improve state synchronization (`_sync_and_index`) using `sync_check_tokens` + to safely verify incremental history appends. + - Add explicit lifecycle management methods (`clear`, `close`, `accept`) + for better API symmetry and resource cleanup. + - examples: add benchmark script for speculative decoding + - Add `benchmark_speculative.py` to the `examples/benchmark` directory. + - Test `LlamaPromptLookupDecoding` and `LlamaNGramMapDecoding` (k/k4v). + - Include diverse test scenarios (code, JSON logs, tables, essays) to + measure tokens-per-second (TPS) speedup compared to baseline generation. + +- docs(speculative): update wiki for NGramMap k/k4v modes and lifecycle APIs +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + + - Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`, `sync_check_tokens`) and their validation rules. + - Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. + - Document the newly exposed lifecycle methods (`clear`, `close`, `accept`). + - Add comprehensive usage examples demonstrating `k4v` mode with memory caps. + - Update internal state descriptions (replacing `_ngram_map` with `_map_k` + and `_map_k4v`). 
+ - Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + and cross-link the new `benchmark_speculative.py` script. + +- docs(readme): revamp speculative decoding documentation +Expand the Speculative Decoding section to fully document the +new `LlamaNGramMapDecoding` capabilities and configuration options. + + - Clarify that `LlamaNGramMapDecoding` is a model-free prompt lookup + decoder that does not require a secondary GGUF draft model. + - Add a detailed parameter table explaining `mode` (k vs. k4v), + `min_hits`, memory caps, and sync thresholds. + - Provide usage examples and tuning recommendations for different + hardware (e.g., lowering `num_pred_tokens` for CPU setups). + - Demote the older `LlamaPromptLookupDecoding` to a legacy section, + warning about its sliding-window overhead on long contexts. + - Add practical notes on performance and state management (`clear()`). + +- docs(readme): Removed outdated macOS installation guides and added the latest installation notes. + +- docs(readme): Add Windows ROCm build instructions (by **@0xDELUXA**) + - Optimize the formatting of the ROCm section in README.md. + +- fix: wire LFM VL chat handlers into server loader (by **@JayAnderson360**) + +- build(cmake): disable building of upstream unified binary + - Set `LLAMA_BUILD_APP` to `OFF` to prevent the compilation of the new + unified `llama` binary introduced in upstream llama.cpp. + + - Since the Python package only requires the underlying shared libraries + and specific targets, explicitly disabling the standalone application + reduces build times and prevents unnecessary executable artifacts from + being compiled. + +- build(deps): align Jinja2 minimum with Transformers + - Require Jinja2 >= 3.1.0 for HuggingFace-style chat template support. + + - The updated Jinja2ChatFormatter relies on behavior aligned with Transformers' + chat-template runtime, which also requires Jinja2 3.1 or newer.
Updating the + minimum dependency avoids parser/runtime differences with older Jinja versions. + +- ci: update metal build/test job to macos-26/macos-15-intel + - Build on the Tahoe runners in order to enable the tensor API for M5 and A19. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01](https://github.com/ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260606 + +For more information, see: https://github.com/JamePeng/llama-cpp-python/compare/a778c57d73ec7d4f43e2518a513e7d4cf68a0df8...db8292d336ae1e708623792426481c414754353e + +## [0.3.39] Dynamic GGML Backends, Qwen3-ASR/MiniCPM-V-4.6, On-Device Hybrid Checkpoint, and Granular Logging + +- **ci(cu131/128/126/124): build wheels with GGML dynamic backends for Windows/Linux** + - Replace the old CPU/AVX release tag matrix with a single backend + wheel layout. + - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship + runtime-loadable GGML backend DLLs and CPU variant backends. + - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, + tests, tools, server, embedded UI, and curl. + - Remove the `.basic` style local version suffix and publish wheels + as `+cu131`. + - Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as zen4, cooperlake, or sapphirerapids due to compiler intrinsic support limitations. + +- **feat(core): support loading GGML_BACKEND_DL dynamic backend libraries from wheel lib** + - Import `ggml_backend_load_all_from_path` and `ggml_backend_reg_count` + from `_ggml`. + - Load dynamic ggml backend libraries from the packaged `llama_cpp/lib` + directory after `llama_backend_init()`.
+ - Support wheels built with `GGML_BACKEND_DL`, where CPU variants and + accelerator backends such as `ggml-cpu-*` and `ggml-cuda` are shipped as + separate runtime libraries. + - Print the registered backend count in verbose mode to help diagnose backend + discovery issues. + +- **build(cmake): refactor install target lists for new GGML backend layout** + - Categorize build targets into logical groups (`LLAMA_CPP_TARGETS`, + `GGML_CORE_TARGETS`, `GGML_CPU_VARIANT_TARGETS`, and `GGML_BACKEND_TARGETS`) + to improve maintainability and keep the Python package installation in sync + with the updated upstream GGML backend layout. + - Add missing targets such as `llama-common` and the separated + `ggml-cpu-*` CPU variant backends. + - Ensure all grouped targets are passed through `llama_cpp_python_install_target`. + - Update llama build option descriptions to match the current upstream naming style. + - Explicitly disable `LLAMA_BUILD_SERVER` to avoid building the server target for Python package wheels. + - Explicitly disable `LLAMA_BUILD_UI` and `LLAMA_USE_PREBUILT_UI` because the + embedded server Web UI is not needed for wheel builds. + - Keep examples, tests, and curl support disabled for minimal wheel artifacts. + - Add a cleanup function to strip `cmake`, `pkgconfig`, and import libraries from the python wheel runtime directories. + - Ensures Windows builds only package the required runtime DLLs. + +- **Implement Qwen3ASRChatHandler for Qwen3-ASR models.** + - Integrate MTMD multimodal logic to extract and inject `audio_url` and base64 `input_audio` data directly into the `<|audio_start|><|audio_pad|>[DATA]<|audio_end|>` sequence. + - Define a default multilingual transcription system prompt and configure model-specific stop tokens. + - docs(README.md): add Qwen3-ASR documentation and usage example + - Update the supported multi-modal models table to include `qwen3-asr` and the `Qwen3ASRChatHandler`. 
+ - Add a new dedicated section for Speech-to-Text inference with a complete, collapsible Python script. + - Provide a `build_media_payload` helper function to demonstrate proper Base64 encoding of local `.wav` and `.mp3` files into OpenAI-compatible `input_audio` schemas. + - Include a critical warning advising users to use BF16 quantization for the multimodal projector (`mmproj`) to prevent audio degradation. + - Clarify usage mechanics, specifically that all instructions must be placed in the `system` role due to the ASR template's text-dropping behavior. + +- **Implement MiniCPMV46ChatHandler for MiniCPM-V-4.6** + +- **feat(core): integrate fine-grained logging API into Llama class** + - This commit exposes the newly refactored `_logger` configuration system directly through the `Llama` class, providing users with robust, programmatic control over native `llama.cpp` backend logs. + - docs(wiki): document runtime verbosity and log filters for Llama + - docs(Llama.md): update verbose=False vs. verbosity=0 note + - Key changes: + - Expand `Llama.__init__` with `verbosity`, `log_filters`, and `log_filters_case_sensitive` parameters. + - Add instance methods for runtime log management (`set_verbosity`, `get_verbosity`, `set_log_filters`, `add_log_filters`, `clear_log_filters`, etc.). + - Add comprehensive docstrings explaining the 0-5 verbosity scale and explicitly noting the process-global nature of the native backend logger. + - Advantages over the legacy implementation: + - Granular Control: Replaces the restrictive binary `verbose=True/False` flag (which only toggled between ERROR and DEBUG) with a granular 6-tier scale (output, error, warn, info, trace, debug). + - Dynamic Filtering: Empowers users to actively suppress specific noisy C++ logs using custom substring filters, removing the need for hardcoded internal patches. 
+ - Better Discoverability: Attaches logging controls directly to the `Llama` object, making log management much more accessible and intuitive without requiring users to import internal logger modules. + +- **feat(logger): refactor and enhance ggml logging configuration system** + - Introduce a `LoggerConfig` dataclass to provide fine-grained control over native ggml/llama.cpp runtime logging. + - Align `verbosity` levels (0 to 5) with upstream `llama.cpp` conventions (`common/log.h`). + - Implement a dynamic, configurable substring filtering system, replacing the hardcoded "CUDA Graph" patch with `DEFAULT_LOG_FILTERS`. + - Add comprehensive public APIs for log management: `configure_logging`, `set_verbosity`, `set_quiet`, `set_silent`, `set_log_filters`, and `add_log_filters`. + - Maintain backwards compatibility for the existing `set_verbose(bool)` function. + - Improve the `ggml_log_callback` to correctly handle `GGML_LOG_LEVEL_CONT` by inheriting the verbosity of the preceding log message. + - Route `GGML_LOG_LEVEL_NONE` to `stdout` and all other diagnostic logs to `stderr` by default. + - docs(Logger.md): Upload Logger documentation + +- fix(MTMDChatHandler): correct audio_url content type check and improve variable handling + - Changed condition from `content == "audio_url"` to `content_type == "audio_url"` for proper type-based dispatching. + - Extracted `audio_url` variable for better readability. + - Converted `else` to `elif content_type == "input_audio"` to make the control flow explicit and safer. + +- fix(_internals): Remove unnecessary free operations; models should not be released within the context. + +- **feat(cache): add on-device hybrid checkpoint support** + - Introduce `HybridCheckpointCache` with dual-mode behavior (Host/On-Device). + - Device mode utilizes `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to keep tensor + payloads in `llama_context` VRAM, reducing host-device copy overhead. 
+ - Host mode remains the default, preserving full Python-owned rollback history. + - Implement safety guards against stale on-device checkpoint restores and + enforce one active device checkpoint per `seq_id`. + - Unify checkpoint management with shared FIFO eviction. + - Expose `checkpoint_on_device` in `Llama.__init__` and reduce default + `ctx_checkpoints` from 32 to 16. + - Enhance verbose logging and docs to clarify host vs. VRAM ownership + semantics and track memory usage accurately. + - Rename internal `_flag_partial` to `_flags` to support multiple state flags. + - Update /docs/wiki/core/Llama.md for on_device option + - Update /docs/wiki/modules/LlamaCache.md for on_device option + +- docs: Update /docs/wiki and README.md file and remove outdated mkdocs workflow + - docs(readme): update wheel requirements and dynamic CPU backend info + - Update supported CUDA versions to include 12.8 and 13.1, while outlining + the supported compute architectures (SM70 up to SM120a). + - Document the transition to `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` + starting in `0.3.39-preview`. + - Clarify that dynamic CPU backend loading eliminates the need for separate + `Basic` and `AVX2` wheel distributions. + - Add a technical note in the FAQ recommending LLVM/Clang over MSVC for + achieving full x64 CPU variant coverage on Windows. 
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26](https://github.com/ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260517 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/ef27f333f367fdc53dc1a729ad8bb6c3c9362514...e87041e4ee6a89798abe9f36315f60f3fb06c5cb + ## [0.3.38] Optimized CJK Detokenization, Sync Grammar Parser, and Patched CUDA Graph Logs - perf: Optimize detokenize buffer sizing for CJK-heavy outputs @@ -397,7 +815,7 @@ This commit significantly overhauls the media parsing and loading pipeline in `M - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea](https://github.com/ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea) -## [0.3.30] Milestone Release +## [0.3.30-Milestone] Milestone Release I will update the release notes for version 0.3.30 in the [discussion](https://github.com/JamePeng/llama-cpp-python/discussions). diff --git a/CMakeLists.txt b/CMakeLists.txt index 04d3ec1fff..5b2cfeeb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,13 +11,12 @@ set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "" FORCE) set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "" FORCE) -# Helper function to install targets to Python package directories +# Install a built target into the Python package runtime directory. function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - # Define install destinations to avoid code duplication set(INSTALL_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" @@ -33,6 +32,9 @@ function(llama_cpp_python_install_target target) RESOURCE DESTINATION ${DIR} ) + # Copy runtime DLL dependencies of this target when available. 
+ # This does not replace explicit installation of dynamic backend + # targets such as ggml-cpu-*; those are installed as targets below. # Automatically handle Windows DLL installation for each target if (WIN32) install( @@ -57,6 +59,180 @@ function(llama_cpp_python_install_target target) endif() endfunction() + +# Copy an extra Windows runtime DLL into the Python package runtime directory +# during the CMake install step. +# +# Some dynamically loaded backend libraries depend on runtime DLLs that are not +# always discoverable through $<TARGET_RUNTIME_DLLS:...>. One important example +# is libomp140.x86_64.dll, required by LLVM OpenMP CPU backend variants. +function(llama_cpp_python_install_windows_runtime_file runtime_file) + if(NOT WIN32) + return() + endif() + + if(NOT runtime_file) + return() + endif() + + if(NOT EXISTS "${runtime_file}") + message(WARNING + "Windows runtime DLL was selected but does not exist and will not be copied: " + "${runtime_file}" + ) + return() + endif() + + # Normalize Windows paths for generated cmake_install.cmake. + # Without this, paths like C:\Program Files (...) may produce invalid + # CMake escape sequences such as \P during install. + file(TO_CMAKE_PATH "${runtime_file}" runtime_file_cmake) + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + + message(STATUS + "Will copy Windows runtime DLL during install: " + "${runtime_file_cmake} -> ${DIR_CMAKE}" + ) + + install( + FILES "${runtime_file_cmake}" + DESTINATION "${DIR_CMAKE}" + ) + endforeach() +endfunction() + + +# Locate and install the Windows LLVM OpenMP runtime when available. +# +# GGML CPU all-variant backends built with LLVM/Clang + OpenMP depend on +# libomp140.x86_64.dll. Since ggml-cpu-*.dll files are loaded dynamically via +# ggml_backend_load_all_from_path(), the OpenMP runtime must be packaged next to +# them under llama_cpp/lib.
+# +# CI may pass LLAMA_CPP_OPENMP_RUNTIME_DLL explicitly. Local builds can rely on +# fallback search paths for Visual Studio Enterprise / BuildTools. +function(llama_cpp_python_install_windows_openmp_runtime) + if(NOT WIN32) + return() + endif() + + set(OPENMP_RUNTIME_DLL "") + set(OPENMP_RUNTIME_SOURCE "") + set(FOUND_OPENMP_DLLS "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL) + if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL") + else() + message(WARNING + "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " + "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " + "VC143 LLVM OpenMP runtime discovery." + ) + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) + file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) + + set(VS_OPENMP_VC143_PATTERNS + # Prefer VS 2022 VC143 LLVM OpenMP redist paths. + # The MSVC version directory is intentionally globbed because + # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + + # Secondary VS layout fallbacks for unusual installations. 
+ "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + ) + + foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS}) + file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") + list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) + endforeach() + + if(FOUND_OPENMP_DLLS) + list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) + list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist") + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll") + + if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "System32 fallback") + endif() + endif() + + if(OPENMP_RUNTIME_DLL) + message(STATUS + "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: " + "${OPENMP_RUNTIME_DLL}" + ) + llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") + else() + message(WARNING + "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 " + "Enterprise/BuildTools VC143 redist paths under Program Files and " + "Program Files (x86), with a fuzzy MSVC version match such as " + "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. " + "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " + "the packaged ggml-cpu-*.dll files may fail to load at runtime. " + "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " + "to package it explicitly." + ) + endif() +endfunction() + + +# Remove development-only artifacts from Python wheel runtime directories. 
+# +# Upstream install rules may place CMake package files, pkg-config files, and +# Windows import libraries under llama_cpp/lib because CMAKE_INSTALL_LIBDIR is +# redirected there for wheel builds. They are not needed at runtime. +function(llama_cpp_python_cleanup_dev_files) + if(NOT WIN32) + return() + endif() + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + install(CODE " + if(EXISTS \"${DIR}\") + file(GLOB LLAMA_CPP_IMPORT_LIBS \"${DIR}/*.lib\") + if(LLAMA_CPP_IMPORT_LIBS) + file(REMOVE \${LLAMA_CPP_IMPORT_LIBS}) + endif() + + file(REMOVE_RECURSE + \"${DIR}/cmake\" + \"${DIR}/pkgconfig\" + ) + endif() + ") + endforeach() +endfunction() + + if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") @@ -72,16 +248,26 @@ if (LLAMA_BUILD) set(CMAKE_SKIP_RPATH FALSE) # Enable building of the common library - set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE) + set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama: build common utils library" FORCE) # Enable build and link OpenSSL - set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + set(LLAMA_OPENSSL ON CACHE BOOL "llama: use openssl to support HTTPS" FORCE) # Disable building of examples - set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE) + set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama: build examples" FORCE) # Disable building of tests - set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama.cpp: build tests" FORCE) + set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama: build tests" FORCE) + + # Disable building of server + set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) + + # Disable building of unified binary + set(LLAMA_BUILD_APP OFF CACHE BOOL "llama: build the unified binary" FORCE) + + # Disable build the embedded Web UI for server + set(LLAMA_BUILD_UI OFF CACHE BOOL "llama: build the embedded Web UI for server" FORCE) + 
set(LLAMA_USE_PREBUILT_UI OFF CACHE BOOL "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" FORCE) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl to download model from an URL" FORCE) @@ -117,14 +303,38 @@ if (LLAMA_BUILD) set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() - # Define list of GGML targets to install - set(GGML_TARGETS + # Define list of LLAMA_CPP/GGML targets to install + set(LLAMA_CPP_TARGETS llama + llama-common + ) + set(GGML_CORE_TARGETS ggml ggml-base ggml-blas - ggml-cann ggml-cpu + ggml-rpc + ) + + set(GGML_CPU_VARIANT_TARGETS + ggml-cpu-x64 + ggml-cpu-sse42 + ggml-cpu-sandybridge + ggml-cpu-ivybridge + ggml-cpu-piledriver + ggml-cpu-haswell + ggml-cpu-skylakex + ggml-cpu-cannonlake + ggml-cpu-cascadelake + ggml-cpu-cooperlake + ggml-cpu-icelake + ggml-cpu-alderlake + ggml-cpu-sapphirerapids + ggml-cpu-zen4 + ) + + set(GGML_BACKEND_TARGETS + ggml-cann ggml-cuda ggml-hexagon ggml-hip @@ -132,7 +342,6 @@ if (LLAMA_BUILD) ggml-musa ggml-opencl ggml-openvino - ggml-rpc ggml-sycl ggml-virtgpu ggml-vulkan @@ -141,8 +350,12 @@ if (LLAMA_BUILD) ggml-zendnn ) - # Loop through targets to avoid repetitive function calls - foreach(TARGET_NAME ${GGML_TARGETS}) + foreach(TARGET_NAME + ${LLAMA_CPP_TARGETS} + ${GGML_CORE_TARGETS} + ${GGML_CPU_VARIANT_TARGETS} + ${GGML_BACKEND_TARGETS} + ) llama_cpp_python_install_target(${TARGET_NAME}) endforeach() @@ -170,4 +383,13 @@ if (LLAMA_BUILD) llama_cpp_python_install_target(mtmd) endif() + + # Install Windows LLVM OpenMP runtime when available. + # This must run before cleanup so the final wheel keeps runtime DLLs but + # removes development-only files such as .lib, cmake/, and pkgconfig/. + llama_cpp_python_install_windows_openmp_runtime() + + # Run after all runtime targets are installed, including mtmd. 
+ llama_cpp_python_cleanup_dev_files() + endif() diff --git a/README.md b/README.md index c9aba7d42d..433e031ae0 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,11 @@ This package provides: - [Dynamic LoRA Example](https://github.com/JamePeng/llama-cpp-python#dynamic-lora-example) - [Control Vector Injection (Representation Engineering)](https://github.com/JamePeng/llama-cpp-python#control-vector-injection-representation-engineering) - [Sampling Configuration & Usage (LlamaSamplingParams)](https://github.com/JamePeng/llama-cpp-python#sampling-configuration--usage-llamasamplingparams) + - [How to use the ReasoningBudgetSampler](https://github.com/JamePeng/llama-cpp-python#reasoning-budget-first-reasoning-block) - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) - Support Models Lists - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) + - [Speech Recognition With Qwen3-ASR (Speech-to-Text)](https://github.com/JamePeng/llama-cpp-python#speech-recognition-with-qwen3-asr-speech-to-text) - [Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text)](https://github.com/JamePeng/llama-cpp-python#comprehensive-omni-multimodal-example-gemma-4-vision--audio--text) - [Embeddings & Reranking (GGUF)](https://github.com/JamePeng/llama-cpp-python#embeddings--reranking-gguf) - [1. Text Embeddings (Vector Search)](https://github.com/JamePeng/llama-cpp-python#1-text-embeddings-vector-search) @@ -63,6 +65,8 @@ Thank you for your continuous support! ## Installation +For a structured source-install and backend build guide, see [docs/wiki/install.md](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md). 
+ Requirements: - Python 3.9+ @@ -159,14 +163,46 @@ $env:CMAKE_ARGS = "-DGGML_CUDA=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` +Note: **Programmatic Dependent Launch (PDL)** is a CUDA optimization for newer NVIDIA GPUs (compute capability >= 9.0; does not include Ada). +It enables stream-level dependency-driven concurrent execution of CUDA kernels within the same stream, achieving a kernel launch overhead reduction similar to CUDA Graphs. If you have a newer NVIDIA GPU (e.g. `Hopper`, `Blackwell` and above), you can achieve significant speedups and latency reduction in token generation across nearly all models when compiling with `-DGGML_CUDA_PDL=ON`. + **Pre-built Wheel (New)** -It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: +It is also possible to install a pre-built wheel with CUDA support. Make sure your system meets the following requirements: -- CUDA Version is 12.4, 12.6, 12.8 or 13.0 -- Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 -- Basic version(Default): A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). -- AVX2 version: A version compiled using AVX2 instructions. +- CUDA version: 12.4, 12.6, 12.8, or 13.1 +- Python version: 3.10, 3.11, 3.12, 3.13, or 3.14 +- Starting with `0.3.39-preview`, Windows and Linux x64 wheels are built with `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS`.
+ +This means CPU backends are shipped as dynamically loaded runtime libraries under: + +```text +site-packages/llama_cpp/lib +``` + +Supported CPU backend variants may include: + +* `ggml-cpu-x64` +* `ggml-cpu-sse42` +* `ggml-cpu-sandybridge` +* `ggml-cpu-ivybridge` +* `ggml-cpu-piledriver` +* `ggml-cpu-haswell` +* `ggml-cpu-skylakex` +* `ggml-cpu-cannonlake` +* `ggml-cpu-cascadelake` +* `ggml-cpu-cooperlake` +* `ggml-cpu-icelake` +* `ggml-cpu-alderlake` +* `ggml-cpu-sapphirerapids` +* `ggml-cpu-zen4` + +The old `Basic` and `AVX2` wheel variants are no longer required for the new dynamic-backend wheels. GGML can load the compatible CPU backend at runtime, which improves CPU instruction-set compatibility across different x64 machines. + +Before `0.3.39-preview`: + +* `Basic`: compiled without AVX instructions for maximum compatibility. +* `AVX2`: compiled with AVX2 instructions for newer CPUs. Check the releases page: https://github.com/JamePeng/llama-cpp-python/releases @@ -239,6 +275,8 @@ On MacOS, Metal is enabled by default(`GGML_METAL=ON`). Using Metal makes the co To disable the Metal build at compile time use the `CMAKE_ARGS="-DGGML_METAL=OFF"` cmake option. +When built with Metal support, you can explicitly disable GPU inference with the `n_gpu_layers=0` parameter. + ```bash pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -247,6 +285,7 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python It is also possible to install a pre-built wheel with Metal support. As long as your system meets some requirements: +- CPU Arch: arm64 - MacOS Version is 11.0 or later - Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 @@ -258,18 +297,55 @@ https://github.com/JamePeng/llama-cpp-python/releases
HIP (ROCm) -This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. + -
+ Linux ROCm -You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). + This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. -To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: + You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). -```bash -CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" -``` -Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. + To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: + + ```bash + CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" + ``` + Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. + + More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip + +
+ + -
+ Windows ROCm + + > **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. + + ```powershell + cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } + + rocm-sdk init + + $ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" + $ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" + $ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName -More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip + $env:HIP_PATH = $ROCM_DEVEL + $env:ROCM_PATH = $ROCM_DEVEL + $env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" + $env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" + $env:CMAKE_GENERATOR = "Ninja" + $env:HIP_PLATFORM = "amd" + $env:CC = "$ROCM_DEVEL\lib\llvm\bin\clang.exe" + $env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" + $env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" + + $R = $ROCM_DEVEL -replace '\\', '/' + $env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" + + pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir + ``` + +
@@ -348,46 +424,83 @@ CMAKE_ARGS="-DGGML_RPC=on" pip install "llama-cpp-python @ git+https://github.co -### Windows Notes - +### Install Notes
-Error: Can't find 'nmake' or 'CMAKE_C_COMPILER' + Optimization Options (Optional) -If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install: +> **💡 Tip:** To save compilation time, you can skip building llama.cpp's standalone examples, tools, tests, and server by adding the following flags; none of these targets are required for the Python bindings: -```ps -$env:CMAKE_GENERATOR = "MinGW Makefiles" -$env:CMAKE_ARGS = "-DGGML_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe" +```bash +-DLLAMA_BUILD_EXAMPLES=OFF \ +-DLLAMA_BUILD_TOOLS=OFF \ +-DLLAMA_BUILD_TESTS=OFF \ +-DLLAMA_BUILD_SERVER=OFF ``` - -See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
-### MacOS Notes +
+ CUDA compiler warning suppression is optional +CUDA nvcc compiler may print many template-related warnings from ggml-cuda, such as: -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/) +```bash +warning #177-D +warning #221-D +warning #550-D +``` -
-M1 Mac Performance Issue +These warnings are usually noisy diagnostics rather than build blockers. They flood the build logs and waste CPU time on printing. -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +For cleaner CI/local logs, you can pass: ```bash -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" ``` - -Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
-M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))` + Notes for `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` builds +When building wheels with `GGML_BACKEND_DL=ON` and `GGML_CPU_ALL_VARIANTS=ON`, +GGML CPU backends are built as separate dynamic libraries, such as: + +```text +ggml-cpu-x64.dll +ggml-cpu-haswell.dll +ggml-cpu-alderlake.dll +ggml-cpu-zen4.dll +``` +These backend libraries must be packaged together under: + +```text +site-packages/llama_cpp/lib +``` -Try installing with +The runtime must also explicitly load them with: -```bash -CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +```text +ggml_backend_load_all_from_path() ``` + +### Windows notes + +For full x64 CPU variant coverage, `LLVM/Clang` is recommended. `MSVC` may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids`. + +If `GGML_OPENMP=ON` is used, the LLVM OpenMP runtime must also be packaged next to the backend DLLs: + +```text +libomp140.x86_64.dll +``` + +Without this file, `ggml-cpu-*.dll` may fail to load dynamically at runtime. + +### Wheel packaging checklist + +* Enable `GGML_BACKEND_DL=ON` +* Enable `GGML_CPU_ALL_VARIANTS=ON` +* Use `GGML_NATIVE=OFF` for portable wheels +* Install all `ggml-cpu-*` backend libraries into `llama_cpp/lib` +* Package required runtime dependencies such as `libomp140.x86_64.dll` +* Remove development-only files such as `.lib`, `cmake/`, and `pkgconfig/` +
### Upgrading and Reinstalling @@ -787,6 +900,83 @@ Mirostat actively maintains a target entropy (`tau`) during generation to preven * **`logits_processor`** (`LogitsProcessorList`, optional): Custom Python callbacks to modify the logits tensor in-place before sampling. * **`stopping_criteria`** (`StoppingCriteriaList`, optional): Custom Python callbacks to halt generation based on the current sequence or scores. + +### Reasoning Budget (First Reasoning Block) + +`llama-cpp-python` provides a generic reasoning-budget sampler for models that expose their thinking content with visible start/end tags. It controls only the **first visible reasoning block** in the generated output. After that block naturally ends or is forcibly closed, the sampler switches to passthrough mode and later reasoning tags are ignored. + +This feature is intentionally model-agnostic. It does not infer model families, inspect chat templates, or guess thinking tags. If a model uses tags other than `<think>...</think>`, pass the correct `reasoning_start` and `reasoning_end` explicitly. + +| Parameter | Default | Description | +| --- | --- | --- | +| `reasoning_budget` | `-1` | Token budget for the first visible reasoning block. `-1` disables the sampler, `0` forces an immediate end after the block starts, and `N > 0` allows at most `N` generated tokens inside the block. | +| `reasoning_start` | `"<think>"` | Token/text sequence that marks the beginning of the first reasoning block. | +| `reasoning_end` | `"</think>"` | Token/text sequence that naturally ends the reasoning block. When the budget is exhausted, the sampler forces this sequence. | +| `reasoning_budget_message` | `None` | Optional message inserted before `reasoning_end` when the budget is exhausted. | +| `reasoning_start_in_prompt` | `False` | Set to `True` only when the prompt/chat template has already inserted `reasoning_start`, so the sampler should start counting from the first generated token.
| +| `reasoning_start_max_tokens` | `32` | Safety window for non-reasoning outputs. If `reasoning_start` is not generated within this many output tokens, the sampler becomes a no-op. Set to `None` to wait indefinitely. | + +Basic usage with the default `<think>...</think>` tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_budget_message="\n[reasoning budget exhausted]\n", + # You can also inject a natural-language transition before reasoning_end: + # reasoning_budget_message="\n...Wait, I have been thinking long enough. Let me start answering the user's question.\n", +) +``` +When the budget is exhausted, the sampler forces: `reasoning_budget_message` + `reasoning_end` + +For Mistral-style thinking tags, pass the tags explicitly: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="[THINK]", + reasoning_end="[/THINK]", +) +``` + +For Gemma4 channel-style thinking, adjust the start and end markers to match the visible channel tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<|channel>", + reasoning_end="<channel|>", +) +``` + +Use `reasoning_start_in_prompt=True` when the prompt or chat template has already inserted the reasoning start tag. In that case, the sampler will not see the start tag during generation, so it must start directly in `COUNTING` state from the first generated token.
This is suitable for thinking models or handlers that prefill the assistant prefix with a thinking tag, for example: + +```text +<|im_start|>assistant\n<think>\n +``` + +Example: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<think>", + reasoning_end="</think>", + reasoning_start_in_prompt=True, +) +``` + +`reasoning_start_in_prompt` is **not** a generic "thinking enabled" switch. It should only be set when the final prompt already contains `reasoning_start` before generation begins. For templates that merely enable thinking but still expect the model to generate the start tag itself, keep `reasoning_start_in_prompt=False`. + +When `verbose=True`, high-level reasoning-budget transitions are printed to stderr, such as initialization, start-tag detection, budget exhaustion, forced ending, and final passthrough. + ### 🛠️ Usage Example You can pass these parameters directly when calling the model to generate text.
@@ -835,6 +1025,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` | +| [minicpm-v-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6-gguf) | `MiniCPMv46ChatHandler` | `minicpm-v-4.6` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [gemma4](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | @@ -842,8 +1033,11 @@ Below are the supported multi-modal models and their respective chat handlers (P | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | +| [deepseek-ocr](https://huggingface.co/JamePeng2023/DeepSeek-OCR-2-GGUF) | `MTMDChatHandler` | `None` | +| [mineru2.5-pro](https://huggingface.co/JamePeng2023/MinerU2.5-Pro-2605-1.2B-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | +| [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | | 
[qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | | [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | | [qwen3.6](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | `Qwen35ChatHandler` | `qwen3.6` | @@ -1071,6 +1265,111 @@ print(res["choices"][0]["message"]["content"])
+## Speech Recognition With Qwen3-ASR (Speech-to-Text) + +The `Qwen3ASRChatHandler` is specifically designed for the Qwen3 Automatic Speech Recognition (ASR) models. Unlike standard multimodal models, this handler aggregates system prompts for instructions and automatically extracts audio data from the user's message, ignoring any user text. + +> **⚠️ Important Note on Quantization:** > For Qwen3-ASR models, it is highly recommended to use the **BF16** version of the multimodal projector (`mmproj`). Other quantizations are known to cause severe audio degradation. + +**Example Code**:
+ +```python +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Qwen3ASRChatHandler +import base64 +import os + +# 1. Define paths to the model and the BF16 multimodal projector +MODEL_PATH = r"./Qwen3-ASR-1.7B-BF16.gguf" +MMPROJ_PATH = r"./mmproj-Qwen3-ASR-1.7b-BF16.gguf" + +# 2. Initialize the Llama model with the dedicated ASR handler +llm = Llama( + model_path=MODEL_PATH, + chat_handler=Qwen3ASRChatHandler( + clip_model_path=MMPROJ_PATH, + verbose=False, + ), + n_gpu_layers=-1, + n_ctx=10240, + verbose=False, + verbosity=0 +) + +# 3. Helper function to encode audio files into OpenAI-compatible payloads +_MEDIA_MIME_TYPES = { + '.wav': ('audio', 'wav'), + '.mp3': ('audio', 'mp3'), +} + +def build_media_payload(file_path: str) -> dict: + """Reads a local audio file and converts it into the LLM input structure.""" + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Media file not found: {file_path}") + + extension = os.path.splitext(file_path)[1].lower() + media_category, mime_or_format = _MEDIA_MIME_TYPES.get(extension, ('unknown', 'application/octet-stream')) + + if media_category == 'unknown': + print(f"Warning: Unknown extension '{extension}'.") + + # Read and Base64 encode the file + with open(file_path, "rb") as f: + encoded_data = base64.b64encode(f.read()).decode("utf-8") + + if media_category == 'audio': + return { + "type": "input_audio", + "input_audio": { + "data": encoded_data, + "format": mime_or_format + } + } + else: + return {"type": "text", "text": f"[Attached unsupported file: {file_path}]"} + + +# ======================== +# Main Inference Section +# ======================== + +media_paths = ["./audio/test.wav"] +user_content = [build_media_payload(path) for path in media_paths] + +# 4. Generate the transcription +response = llm.create_chat_completion( + messages=[ + { + "role": "system", + "content": ( + "You are an advanced multilingual Speech-to-Text model. 
" + "Accurately transcribe the audio into text in its original spoken language. " + "You should ignore background noise, filler words, and stutters where possible, " + "and format the final output with correct grammar and capitalization." + ) + }, + { + "role": "user", + "content": user_content + } + ], + temperature=1.0, + top_p=0.95, + top_k=64, + max_tokens=10240, +) + +print(f"Transcribe: {response['choices'][0]['message']['content']}") + +``` + +#### How it works: + +* **`input_audio` Schema:** The script reads the local `.wav` or `.mp3` file, encodes it in Base64, and wraps it in an OpenAI-compatible `"type": "input_audio"` dictionary. +* **System Prompt:** Because the Qwen3-ASR template strips out user text, all instructions (like translation requests or formatting rules) **must** be placed in the `"system"` role. + +
+ ## Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text) Below is a complete, production-ready example demonstrating how to dynamically route and process both image and audio files. It includes a universal media processor that automatically converts local files into the correct payload structure (Data URIs for images, and `input_audio` for audio files). @@ -1245,7 +1544,9 @@ run_inference( | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding |[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding |[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | @@ -1378,44 +1679,116 @@ emb = llm.create_embedding("text") --- -### Speculative Decoding +## Speculative Decoding + +`llama-cpp-python` supports speculative decoding through a `draft_model` passed to the `Llama` class. -`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. +Speculative decoding lets a draft decoder propose candidate tokens before the main model verifies them. This can improve generation speed, especially for repetitive or structured outputs such as code, JSON, boilerplate text, templates, and long-form responses with repeated patterns. 
-The fastest way to use speculative decoding is through the `LlamaNGramMapDecoding`(**Recommend**) or `LlamaPromptLookupDecoding` class. +The recommended built-in draft decoder is `LlamaNGramMapDecoding`. -Just pass this as a draft model to the `Llama` class during initialization. +Unlike neural draft-model speculative decoding, `LlamaNGramMapDecoding` does not require a second GGUF model. It is a model-free prompt n-gram lookup decoder that predicts draft tokens from already verified token history. ```python from llama_cpp import Llama from llama_cpp.llama_speculative import LlamaNGramMapDecoding llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", + model_path="path/to/model.gguf", n_ctx=4096, n_gpu_layers=-1, draft_model=LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=10 - ) + num_pred_tokens=10, + ), ) response = llama.create_chat_completion( - messages=[{"role": "user", "content": "Write a python script..."}] + messages=[ + { + "role": "user", + "content": "Write a Python script using sqlite3 with repeated CRUD classes.", + } + ] +) +``` + +`LlamaNGramMapDecoding` maintains an internal n-gram index and can reuse repeated token patterns from the current prompt and generated context. Compared with the legacy sliding-window prompt lookup decoder, it avoids scanning the full token history on every call, making draft generation much cheaper for long contexts. + +#### Advanced configuration + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + max_entries_per_key=None, + sync_check_tokens=16, ) ``` -Note: `LlamaPromptLookupDecoding.num_pred_tokens` is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. 
Now, `LlamaNGramMapDecoding` with the new Hash Map algorithm, draft generation becomes instantaneous $O(1)$, and the time consumption is almost 0 regardless of whether you set the prediction to 2 or 10 words. -### Adjusting the Context Window +| Parameter | Default | Description | +| --------------------- | ----------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `ngram_size` | `3` | Number of tokens used as the lookup key. Larger values require stricter matches. | +| `num_pred_tokens` | `10` | Maximum number of draft tokens to propose. | +| `mode` | `"k"` | N-gram map mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `2` | Minimum number of historical matches required before returning draft tokens. Use `1` for higher recall, or `2+` to reduce low-confidence drafts. | +| `max_entries_per_key` | `None` in `"k"` mode, `8` in `"k4v"` mode | Optional memory cap per n-gram key. Strongly recommended for `"k4v"` mode. | +| `sync_check_tokens` | `16` | Number of trailing tokens used to detect whether the new input is an incremental append or requires rebuilding the internal index. | + +#### Choosing a mode + +`LlamaNGramMapDecoding` supports two modes: -The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. +* `mode="k"`: stores n-gram keys mapped to historical positions. This is the default and is usually the best starting point. +* `mode="k4v"`: stores n-gram keys mapped directly to continuation tokens. This can make continuation lookup cheaper, but uses more memory. When using `"k4v"`, keeping `max_entries_per_key` enabled is recommended. 
+ +For most users, the default configuration is enough: + +```python +draft_model=LlamaNGramMapDecoding() +``` -For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: +For higher recall, especially when the prompt has fewer repeated patterns, you can lower `min_hits`: ```python -llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048) +draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + min_hits=1, +) ``` +For CPU-only machines, smaller draft lengths such as `num_pred_tokens=2` may still be a better tradeoff. For GPU inference, larger values such as `num_pred_tokens=10` are often reasonable, but the best value depends on model size, prompt structure, backend, and acceptance rate. + +#### Legacy prompt lookup decoder + +`LlamaPromptLookupDecoding` is still available for compatibility: + +```python +from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + +draft_model = LlamaPromptLookupDecoding( + max_ngram_size=3, + num_pred_tokens=10, +) +``` + +However, it uses a legacy NumPy sliding-window lookup and may have higher overhead on long contexts. For new usage, prefer `LlamaNGramMapDecoding`. + +#### Notes + +* Speculative decoding still requires the main model to verify proposed draft tokens. +* Speedup depends on how many draft tokens are accepted. +* Prompt n-gram speculative decoding works best when the current context contains repeated patterns. +* It is especially useful for code generation, structured text, repeated templates, and boilerplate-heavy completions. +* `LlamaNGramMapDecoding` stores internal Python-side history and indexes. If you want to reuse the same decoder instance for an unrelated generation, call `draft_model.clear()`. 
+ +--- + ## Docker image See here: https://github.com/JamePeng/llama-cpp-python/tree/main/docker#cuda_simple @@ -1587,23 +1960,26 @@ This error is primarily caused by the following reasons: 3. **CUDA Version Mismatch:** Regarding `ggml-cuda.dll`, the CUDA version of the pre-compiled library does not match your local CUDA Toolkit version (e.g., a mismatch between CUDA 12.X and CUDA 13.X). It is recommended to fully configure your local CUDA Toolkit environment (ensuring the PATH for dynamic libraries is set and the nvcc compiler is recognized). Then, clone the code and compile it locally. -### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions range from 300MB to 900MB? +### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions are 300MB or larger? -My GitHub Actions script is configured to compile against **all supported CUDA compute architectures** for each specific CUDA version I maintain. +My GitHub Actions workflow is configured to compile against multiple supported CUDA compute architectures for each CUDA version I maintain. For example: -* **CUDA 13.0.2:** Currently supports architectures from SM75 (Turing) up to SM120a (Blackwell). -* **CUDA 12.4.1 and 12.6.3:** Support older architectures as well, such as SM70. -* *(Note: The Windows versions are built to support every architecture compatible with the respective CUDA version).* +- **CUDA 13.1 and CUDA 12.8:** currently target architectures from SM75 (Turing) up to SM120a / SM121a (Blackwell generation, depending on CUDA support). +- **CUDA 12.4 and CUDA 12.6:** currently target architectures from SM70 (Volta) up to SM90 (Hopper). + +Libraries from other authors are often smaller because they may only compile for a single architecture, such as RTX 30 series (`SM86`) or RTX 40 series (`SM89`). To maximize compatibility, these wheels include CUDA kernels for a wider range of GPUs. 
You only need to choose the wheel that matches your installed CUDA version. + + - **Updated 2026-05-16 / 2026-05-17:** Starting with `0.3.39-preview`, Windows wheels support the `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` runtime layout. CPU backend libraries such as `ggml-cpu-*.dll` are packaged under `site-packages/llama_cpp/lib` and loaded dynamically at runtime. This allows GGML to select a compatible CPU backend automatically, reducing the need for separate `Basic` / `AVX2` wheel variants. -The reason libraries from other authors are smaller is that they often **only compile for a single architecture** (e.g., targeting only the RTX 30 series [SM86] or the RTX 40 series [SM89]). To maximize convenience, I provide an **integrated compilation** covering a wide range of hardware; you simply need to select the CUDA version that matches your environment to load and run it. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids` due to compiler intrinsic support limitations. ### Quick tips for develop/user (continuously updated): * 1. I've determined that `llama_cpp.server` is currently in a semi-deprecated state (meaning it won't be maintained unless absolutely necessary, and I might even consider deleting or separating it to reduce the library size). I highly recommend using the `llama-server` program maintained by the upstream `llama.cpp` project, which offers a lower-level implementation, more frequent maintenance and optimization, and more reliable API calls. -* 2. Regarding AMD and Intel graphics cards, AMD can certainly use ROCm as the primary backend (but the drawback is that it's basically only stable on Linux platforms), and Intel's Sycl will also encounter some compilation difficulties. 
I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. +* 2. Regarding AMD and Intel graphics cards, AMD can use ROCm as the primary backend, while Intel's SYCL will encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. * 3. If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index ab51ef754e..0000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -title: API Reference ---- - -## High Level API - -High-level Python bindings for llama.cpp. 
- -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - create_chat_completion_openai_v1 - - set_cache - - save_state - - load_state - - token_bos - - token_eos - - from_pretrained - show_root_heading: true - -::: llama_cpp.LlamaGrammar - options: - members: - - from_string - - from_json_schema - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -## Low Level API - -Low-level Python bindings for llama.cpp using Python's ctypes library. - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - # filter only members starting with `llama_` - filters: - - "^llama_" - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - # filter only members starting with `LLAMA_` - filters: - - "^LLAMA_" - -## Misc - -::: llama_cpp.llama_types - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index 047bc14424..0000000000 --- a/docs/changelog.md +++ /dev/null @@ -1 +0,0 @@ --8<- "CHANGELOG.md" \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 60bc7aef42..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Getting Started ---- - --8<- "README.md" \ No newline at end of file diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index e006fc0a3c..0000000000 --- 
a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DGGML_METAL=on" pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 or higher installed -llama-cpp-python         0.1.68 - -``` - -**(5) Download a v3 gguf v2 model** - - **ggufv2** - - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/CodeLlama-7B-GGUF - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is gguf v2 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]Q4_0.gguf -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffbf..0000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No 
newline at end of file diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index b96ec964c7..1ffcb1e227 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -4,14 +4,15 @@ - **Author**: JamePeng - **Maintainer**: LLM-assisted documentation workflow - **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki -- **Last Modified**: 2026-05-02 +- **Last Modified**: 2026-06-02 - **Version Target**: latest source code -- **Schema Version**: 0.3 +- **Schema Version**: 0.4 **Purpose**: - Maintain a living, always-up-to-date, structured documentation wiki for the `llama-cpp-python` library, with LLMs acting as the primary documentation maintainer. - The wiki must help users understand the latest public API, core classes, modules, configuration options, examples, and migration paths based on the current source code. - The wiki should explain not only *how to call an API*, but also *what role the class/module plays in the library*, *how its state is configured*, and *how users should choose between related APIs*. +- The schema also defines the expected wiki directory layout, page ownership, and update rules so new pages can be generated consistently. **Core Principles**: - The source of truth is the latest code in `llama_cpp/`, especially: @@ -24,11 +25,12 @@ - `llama_cpp.py` - `mtmd_cpp.py` - `_ggml.py` + - `_logger.py` - Never invent parameters or behavior. Always read the current source code before writing/updating a page. - Prefer documenting public and user-facing APIs first. Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. - Clearly mark deprecated, legacy, or changed usage with a warning and show the modern replacement. -- Use internal wiki links (e.g. [[Llama]], [[Qwen35ChatHandler]]) for cross-referencing. 
+- Use internal wiki links, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, or `[[Qwen35ChatHandler]]`, for cross-referencing. - Keep pages concise, professional, and user-friendly. **Documentation Language**: @@ -37,9 +39,53 @@ - Code comments inside examples should also be in English by default. - If the source code contains Chinese comments or non-English notes, translate them into clear English while preserving the original meaning. +**Wiki Directory Layout**: + +The wiki should be organized by documentation purpose rather than by source-file location alone. + +```text +docs/wiki/ +├─ core/ # Core classes and modules (e.g., Llama, main API objects) +├─ development/ # Developer-focused pages, tools, agents, CI/CD workflows +├─ examples/ # Complete runnable examples for users +├─ features/ # High-level features spanning multiple classes/modules +├─ modules/ # Specialized modules (cache, embeddings, logging, speculative decoding, bindings) +├─ types/ # Type definitions and data structures used across the library +├─ .gitkeep # Placeholder for Git to track empty directories +├─ contributing-to-wiki.md # Guidelines for contributing to the wiki +├─ index.md # Entry point and table of contents +├─ install.md # Installation instructions +├─ SCHEMA.md # Documentation schema and style guide (this file) +├─ troubleshooting.md # Known issues, debugging tips, FAQ +``` + +### Top-Level Files + +| Path | Purpose | Update Guidance | +|---|---|---| +| `docs/wiki/SCHEMA.md` | Defines the documentation contract, directory structure, page templates, and LLM update rules. | Update when adding a new page type, directory, documentation standard, or structural convention. | +| `docs/wiki/index.md` | Main wiki landing page and navigation entry. | Update when important pages are added, renamed, reorganized, or promoted. | +| `docs/wiki/contributing-to-wiki.md` | Human and LLM contribution guide for maintaining the wiki. 
| Keep aligned with this schema, especially source-reading and accuracy rules. | +| `docs/wiki/install.md` | Installation guide placeholder or final installation documentation. | Convert from placeholder to complete page when installation docs are ready. | +| `docs/wiki/troubleshooting.md` | Troubleshooting guide placeholder or final diagnostics documentation. | Expand with common runtime, build, backend, model loading, and environment issues. | +| `docs/wiki/.gitkeep` | Keeps the wiki directory tracked when needed. | No documentation content is required. | + +### Directory Ownership + +| Directory | Purpose | Typical Content | Primary Audience | +|---|---|---|---| +| `core/` | High-level public entry points and central user APIs. | `Llama`, model lifecycle, generation APIs, chat/completion interfaces. | General users and advanced users. | +| `modules/` | Focused subsystem pages, user-facing modules, low-level bindings, helpers, and advanced API areas. | Cache, embeddings, grammar, speculative decoding, logging, llama.cpp bindings, MTMD bindings. | Advanced users, extension authors, maintainers. | +| `features/` | Workflow-oriented guides that span multiple APIs or modules. | Chat formatting, structured output, multimodal usage, backend loading, caching workflows, speculative decoding workflows. | Users solving a specific task. | +| `examples/` | Complete runnable examples. | Minimal inference, chat completion, embeddings, grammar-constrained generation, speculative decoding, multimodal usage. | Users who want copy-paste starting points. | +| `types/` | Type and schema documentation. | Request/response structures, typed dictionaries, protocol-style types, OpenAI-compatible payloads. | Users integrating with typed code or API-compatible workflows. | +| `development/` | Maintainer-facing documentation and contribution workflows. | Build notes, CI notes, release notes, commit generation workflow, documentation maintenance rules. | Maintainers and contributors. 
| + **Page Types and Templates**: -1. **Class / Module Page** (e.g. core/Llama.md, modules/LlamaEmbedding.md) +1. **Class / Module Page** + Examples: `core/Llama.md`, `modules/LlamaEmbedding.md`, `modules/LlamaCache.md` + - Frontmatter (YAML): ```yaml --- @@ -50,14 +96,15 @@ version_target: "latest" --- ``` - - Sections (in order): + + - Sections, in order: - Overview - Role in the Library - Constructor (`__init__`) – full parameter table with types, defaults, and explanations - Important Attributes / State - - Core Methods (with signatures and usage examples) + - Core Methods, with signatures and usage examples - Best Practices & Common Patterns - - Deprecated / Changed APIs (with migration notes) + - Deprecated / Changed APIs, with migration notes - Related Links - The **Overview** should briefly explain: @@ -80,24 +127,77 @@ - Only document attributes that affect user understanding, configuration, lifecycle, inference behavior, caching, chat formatting, embeddings, or debugging. Do not document every trivial private variable. -2. **Feature Page** (features/xxx.md) - - Overview, When to use, Related APIs, Code examples, Configuration Notes, Limitations, Related features - - Feature pages should explain workflows across multiple classes or modules. - -3. **Example Page** (examples/xxx.md) - - Goal, Prerequisites, Complete runnable code block, Expected output, Tips - - Rules: - * Use the latest API. - * Include all imports as need. - * Avoid pseudo-code. - * Keep examples focused. - * Mention required model assumptions when needed, such as GGUF file path or chat format. +2. **Feature Page** + Examples: `features/speculative-decoding.md`, `features/embeddings-rerank.md` + + Feature pages should explain workflows across multiple classes or modules. + + Required sections: + - Overview + - When to Use + - Related APIs + - Code Examples + - Configuration Notes + - Limitations + - Related Features + +3. 
**Example Page** + Example: `examples/chat-completion.md` + + Required sections: + - Goal + - Prerequisites + - Complete Runnable Code + - Expected Output + - Tips + + Rules: + - Use the latest API. + - Include all required imports. + - Avoid pseudo-code. + - Keep examples focused. + - Mention required model assumptions when needed, such as GGUF file path, embedding mode, grammar file, chat format, or multimodal assets. + +4. **Development Page** + Example: `development/GitCommitGenerationAgent.md` + + Development pages are maintainer-facing and may document repository workflows, CI, release notes, build matrix decisions, or documentation maintenance conventions. + + Required sections: + - Overview + - Scope + - Workflow + - Inputs / Outputs + - Rules and Constraints + - Examples + - Related Links + +**Cross-Linking Rules**: + +- Use wiki-style internal links for pages that exist or should exist, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, and `[[Logger]]`. +- Link from high-level pages to lower-level module pages when the module explains advanced details. +- Link from feature pages back to the relevant class/module pages. +- Avoid circular explanations. A page may link to another page for details instead of repeating the same explanation. **Update Rules**: + - Before updating any page, the LLM must read the relevant source files. - Update the `last_updated` date. -- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, or backend option, create or expand the corresponding page. +- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, backend option, or binding wrapper, create or expand the corresponding page. - If behavior is inferred from implementation rather than explicitly documented in code, mark the explanation as implementation-based. +- Empty files should be converted into explicit placeholder pages instead of being left blank. 
- Maintain a high standard of readability and accuracy. -This schema is the contract. All generated content must follow it. \ No newline at end of file +**Quality Checklist**: + +Before finalizing a wiki page, verify: + +- The page reflects the latest source code. +- All parameters, defaults, and return values are accurate. +- Examples are runnable and include necessary imports. +- Internal links point to the correct wiki page names. +- Advanced or low-level APIs are clearly labeled. +- Deprecated behavior is clearly separated from current usage. +- The page avoids undocumented claims, speculative behavior, or outdated assumptions. + +This schema is the contract. All generated content must follow it. diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 7a9b7bd6ad..1f7cce206b 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,13 +4,13 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 2026-05-01 +last_updated: 2026-05-16 version_target: "latest" --- ``` ## Overview -The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, hybrid model checkpointing, speculative decoding, and context shifting. +The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, dual-mode hybrid/recurrent checkpointing, speculative decoding, and context shifting. ## Constructor (`__init__`) @@ -51,8 +51,18 @@ Initialize the model and context. 
Note that model loading will immediately alloc | `chat_format` | `str` | `None` | String specifying the chat template (e.g., `"llama-2"`, `"chatml"`). Guessed from GGUF if None. | | `chat_handler` | `LlamaChatCompletionHandler` | `None` | Optional custom handler. See [[ChatHandlers]]. | | `draft_model` | `LlamaDraftModel` | `None` | Optional draft model for speculative decoding. | -| `ctx_checkpoints` | `int` | `32` | Max context checkpoints per slot (Hybrid/SWA models). | -| `checkpoint_interval`| `int`| `4096` | Token interval for saving Hybrid model checkpoints. | +| `ctx_checkpoints` | `int` | `16` | Max hybrid/recurrent context checkpoints to keep. Set to `0` to disable checkpointing for single-turn fast paths. | +| `checkpoint_interval` | `int` | `4096` | Token interval for saving periodic Hybrid/Recurrent checkpoints during long prompt evaluation. | +| `checkpoint_on_device` | `bool` | `False` | Store Hybrid/Recurrent checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Reduces device-to-host copy overhead, but only one active checkpoint per `seq_id` is safe. | + +### Runtime Logging Parameters + +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `verbose` | `bool` | `True` | Backward-compatible boolean native logging switch. `False` keeps only error-level llama.cpp / ggml logs; `True` enables debug-level native logs. If `verbosity` is provided, `verbosity` takes precedence over `verbose`. | +| `verbosity` | `Optional[Union[int, str, bool]]` | `None` | Fine-grained llama.cpp-style native runtime log verbosity. Numeric levels: `0=output`, `1=error`, `2=warning`, `3=info`, `4=trace`, `5=debug`. Use `verbosity=3` for llama.cpp-style default info logs. String aliases such as `"silent"`, `"quiet"`, `"info"`, `"trace"`, and `"debug"` are also accepted. | +| `log_filters` | `Optional[Sequence[str]]` | `None` | Optional substring filters for native runtime logs. 
If any provided substring appears in a decoded backend log message, that message is suppressed. The default logger may include built-in filters for noisy low-level logs such as `CUDA Graph id %d reuse` messages. Pass an empty list `[]` to disable default substring filtering. | +| `log_filters_case_sensitive` | `bool` | `True` | Whether `log_filters` should match case-sensitively. Defaults to `True` for predictable low-level backend log filtering. | *(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. Refer to the source code for the full list).* @@ -111,6 +121,42 @@ model.eval(tokens=[1, 453, 234, 987], active_loras=[{"name": "coding_adapter", " Immediately halts an active generation loop safely. * **Usage**: Typically called from a separate monitoring thread (like a timer). When triggered, the running stream will exit and the final chunk will contain `"finish_reason": "abort"`. +### Runtime Logging Control + +The `Llama` class exposes lightweight runtime helpers for adjusting native llama.cpp / ggml logging after initialization. + +> **Note:** Native backend logging is process-global because llama.cpp / ggml use a global log callback. Changing verbosity or log filters affects all `Llama` instances in the current Python process. + +* `set_verbosity(verbosity: Union[int, str, bool, None])`: Set native runtime log verbosity. +* `get_verbosity() -> int`: Return the current native runtime log verbosity. +* `set_log_filters(filters: Sequence[str], case_sensitive: bool = True)`: Replace substring filters for native runtime logs. +* `add_log_filters(filters: Sequence[str])`: Append substring filters. +* `get_log_filters() -> List[str]`: Return the current substring filters. +* `clear_log_filters()`: Clear all substring filters, including default filters. +* `reset_log_filters()`: Restore default substring filters. 
+ +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style info logs +) + +# Temporarily enable debug-level native logs. +llm.set_verbosity(5) + +# Suppress noisy backend messages by substring. +llm.add_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", +]) + +# Return to quiet error-only logging. +llm.set_verbosity(1) +``` + ### Dynamic LoRA Management The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dynamically per-generation or per-eval. * `load_lora(name: str, path: str)`: Loads an adapter into VRAM (does not apply it yet). @@ -184,23 +230,46 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn llm.create_completion("Once upon a time", active_loras=[{"name": "story", "scale": 0.9}]) # Use sql adapter - llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}])v + llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}]) ``` 5. **Hybrid & Recurrent Architectures**: - The class natively detects Hybrid/Recurrent models (like LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba or specialized SWA models(Gemma3/4)) and automatically enables the `HybridCheckpointCache`. This creates periodic save-states during large context pre-filling, allowing the model to roll back seamlessly if a generation is rejected (e.g., speculative decoding mismatches) without corrupting the recurrent state. + The class natively detects Hybrid/Recurrent models (for example LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba, RWKV, or specialized SWA models such as Gemma3/4) and automatically enables the `HybridCheckpointCache`. 
- * Tips: If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: + Unlike regular Transformer KV caches, Hybrid/Recurrent model memory cannot always be safely truncated token-by-token. The wrapper therefore saves periodic sequence-state checkpoints during long context prefill, allowing rollback to a verified prefix without corrupting recurrent state. + + `HybridCheckpointCache` supports two checkpoint storage modes: + + - **Host checkpoint mode** (`checkpoint_on_device=False`, default): checkpoint payloads are serialized into Python-owned bytes. This supports multiple historical checkpoints per `seq_id`, which is useful for multi-turn reuse and deeper rollback history. + - **Device checkpoint mode** (`checkpoint_on_device=True`): checkpoint tensor payloads are stored in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Python only keeps the host-visible serialized portion. This reduces device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. 
+ + *Tips*: If you are using a hybrid multimodal model for ComfyUI nodes or single-turn API wrappers where you do not need multi-turn state rollback, initialize your `Llama` instance with `ctx_checkpoints=0`: + + ```python + llm = Llama( + model_path="./Qwen3.5-VL-9B.gguf", + chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), + n_ctx=4096, + ctx_checkpoints=0 # Disable checkpoints for zero-latency single-turn fast paths + ) + ``` + + For long prompts on GPU-backed Hybrid/Recurrent models, you can enable device-backed checkpoints to reduce device-to-host copy overhead: + + ```python + llm = Llama( + model_path="./Qwen3.6-27B.gguf", + n_ctx=32768, + n_gpu_layers=-1, + ctx_checkpoints=16, + checkpoint_interval=4096, + checkpoint_on_device=True + ) + ``` + + Use `checkpoint_on_device=False` if you need multiple historical checkpoints for the same `seq_id`. Use `checkpoint_on_device=True` when fast rollback/checkpointing is more important than keeping many historical checkpoint payloads. - ```python - llm = Llama( - model_path="./Qwen3.5-VL-9B.gguf", - chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), - n_ctx=4096, - ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH - ) - ``` 6. **Assistant Prefill**: `llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation. You can now simply use the `assistant_prefill=True` parameter in the `create_chat_completion` function. @@ -297,6 +366,67 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn run_controlled_generation("Explain quantum mechanics in a way that relates to bugs in code.", timeout_seconds=8) ``` +8. **Runtime Logging & Backend Noise Filtering**: + + `Llama` supports fine-grained native llama.cpp / ggml logging through `verbosity`. This is more precise than the legacy `verbose` boolean flag. 
+ + ```python + from llama_cpp import Llama + + # Legacy behavior: + # verbose=False -> error-only logs + llm_quiet = Llama( + model_path="models/qwen3.gguf", + verbose=False, + ) + + # Recommended precise logging: + # 0 = output, 1 = error, 2 = warning, 3 = info, 4 = trace, 5 = debug + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style default info logs + ) + ``` + + For low-level debugging, use `verbosity=5`. By default, the logger may suppress known noisy backend messages such as CUDA Graph reuse logs. Pass `log_filters=[]` to disable all substring filtering. + + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # show all debug logs, including normally filtered ones + ) + ``` + + To suppress additional noisy messages, pass substring filters: + + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + ], + ) + ``` + + You can also adjust logging at runtime: + + ```python + llm.set_verbosity(5) + llm.add_log_filters(["llama_perf_context_print"]) + + # Later, return to warning-level logs. + llm.set_verbosity(2) + ``` + + **Important:** native backend logging is process-global. Runtime changes affect all `Llama` instances in the same Python process. + + **verbose=False** vs. **verbosity=0**: These have distinct behaviors. + - `verbose=False` silences Python wrapper prints but not backend diagnostics; like `if self.verbose: print()` + - `verbosity=0` silences all backend non-error output. 
--- diff --git a/docs/wiki/development/git-commit-generation-agent.md b/docs/wiki/development/git-commit-generation-agent.md new file mode 100644 index 0000000000..4cce635154 --- /dev/null +++ b/docs/wiki/development/git-commit-generation-agent.md @@ -0,0 +1,214 @@ +--- +title: Git Commit Generation Agent +page_type: development-helper +source_file: docs/wiki/development/git-commit-generation-agent.md +last_updated: 2026-05-23 +version_target: "latest" +author: JamePeng +audience: maintainers +--- + +# Git Commit Generation Agent for `llama-cpp-python` + +## Overview + +This page defines a maintainer-facing LLM helper workflow for generating +high-quality, descriptive, and standardized Git commit messages for +`llama-cpp-python`. + +## System Persona +You are an expert C++/Python developer and a core maintainer of the +`llama-cpp-python` project. Your task is to generate clear, accurate, and +standardized Git commit messages based on provided diffs, source snippets, +benchmark notes, issue references, or maintainer summaries. + +## Core Principles + +The project follows the **Conventional Commits** specification and requires a +**Developer Certificate of Origin (DCO) Sign-off**. + +Generated commit messages must prioritize: + +- **Why** the change was needed. +- **How** the change was implemented. +- **What** user-visible, runtime, build, packaging, or documentation behavior + changed. +- **What** future maintainers need to know when reading the project history. + +## Input Requirements + +The agent may receive: + +- A full Git diff +- A changed file list +- Source snippets +- Benchmark results +- Maintainer notes +- Issue or PR references +- A natural-language summary of changes + +When the input is incomplete, generate the best possible commit message from the +provided information, but do not invent implementation details. + +## Formatting Rules + +### 1. 
Header Line (Subject) +Use the following format: + +```text +<type>(<scope>): <subject> +``` + +Allowed types: + +| Type | Use for | +| ---------- | ----------------------------------------------------------- | +| `feat` | New features or user-facing capabilities | +| `fix` | Bug fixes | +| `docs` | Documentation-only changes | +| `build` | CMake, build scripts, compiler flags, packaging build logic | +| `perf` | Performance optimizations | +| `ci` | GitHub Actions or other workflow changes | +| `chore` | Maintenance, cleanup, or non-user-facing changes | +| `refactor` | Internal restructuring without behavior change | +| `test` | Test additions or updates | + +Recommended scopes: + +* `llama` +* `core` +* `bindings` +* `sampling` +* `speculative` +* `cache` +* `chat` +* `multimodal` +* `embedding` +* `types` +* `cmake` +* `windows` +* `cuda` +* `metal` +* `ci` +* `docs` +* `readme` +* `packaging` + +Subject rules: + +* Use imperative mood, such as `add`, `fix`, `update`, `skip`, `expose`. +* Do not use past tense, such as `added`, `fixed`, or `updated`. +* Keep the subject under 72 characters when possible. +* Use lowercase unless a proper noun, symbol, or API name requires otherwise. +* Do not end the subject with a period. + +### 2. Body +Leave one blank line between the header and the body. +The body should: +* Start with a short paragraph explaining the motivation or problem. +* Use bullets when the diff contains multiple logical changes. +* Mention important files, classes, functions, flags, or APIs using Markdown + backticks. +* Keep lines wrapped at around 72-80 characters. +* Mention user-visible behavior changes when relevant. +* Mention performance impact only when supported by the input. + +### 3. Footer (Sign-off) +* Leave one blank line after the body. +* You MUST append a generic DCO sign-off line at the very end. +* **Format:** `Signed-off-by: Developer Name <developer@example.com>` + +--- + +## Accuracy Rules + +* Do not invent changed files, functions, APIs, benchmarks, flags, or behavior.
+* Do not claim performance improvements unless benchmark data is provided or the + diff clearly supports the optimization. +* Do not mention issue or PR numbers unless provided by the user. +* Do not include migration notes unless the change affects user-facing APIs. +* If the change is documentation-only, do not imply runtime behavior changed. +* If the change is internal-only, do not overstate it as a user-facing feature. +* Prefer specific technical descriptions over generic wording. + +## Output Rules + +When the user provides a code diff or a summary of changes, analyze the intent +and output only the raw Git commit message. + +Do not: + +* Wrap the commit message in Markdown code fences. +* Add explanations before or after the commit message. +* Add headings such as `Commit message:`. +* Include alternative versions unless explicitly requested. + +## Output Examples + +### Example 1: Build System Change +```text +build(cmake): package LLVM OpenMP runtime DLL for Windows wheels + +Dynamically loaded GGML CPU backends compiled with LLVM/Clang and OpenMP +require `libomp140.x86_64.dll` at runtime. Since this dependency is not +always caught by `$<TARGET_RUNTIME_DLLS>`, it must be packaged manually. + +- Add `llama_cpp_python_install_windows_runtime_file` to handle installing + arbitrary extra DLLs with proper CMake path normalization. +- Add fallback search logic to locate the OpenMP DLL in common Visual Studio + directories. +- Execute the installation before the dev-file cleanup step to ensure the + DLL is correctly packaged in the final Python wheel. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +### Example 2: Performance Optimization + +```text +perf(eval): skip unnecessary logit array copies during native sampling + +Introduce a `copy_logits` flag to `Llama.eval()` to control whether C-level +logits are copied into the Python `self.scores` array.
+ +- Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) explicitly + require them. +- Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + +This significantly reduces CPU overhead and memory bandwidth during generation, +as the native `llama.cpp` sampler reads directly from the C context without +needing to expose the `n_vocab` array to Python on every token. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +### Example 3: Documentation Update + +```text +docs(speculative): document n-gram map k/k4v modes and new parameters + +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + +- Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`) and their validation rules. +- Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. +- Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + implementation. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +## Execution + +When the user provides a code diff or a summary of changes, analyze the intent and output ONLY the raw Git commit message following the exact structure and tone demonstrated above. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] diff --git a/docs/wiki/index.md b/docs/wiki/index.md index 02f2dd5b9a..8e5dbed14b 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -10,6 +10,16 @@ The documentation is maintained with the help of LLMs, but the source of truth i ## Quick Navigation +### Getting Started + +Start here if you are installing or rebuilding `llama-cpp-python`.
+ +| Page | Description | +|---|---| +| [install\|Installation] | Source installation guide covering Python setup, CMake options, llama.cpp backend selection, hardware acceleration, rebuilds, and verification. | + +--- + ### Core API Start here if you are using `llama-cpp-python` directly. @@ -30,6 +40,19 @@ These pages document major source modules and related classes. | [modules/LlamaEmbedding\|Llama Embedding] | Embedding-related APIs and usage patterns. | | [modules/LlamaGrammar\|Llama Grammar] | Provides grammar utilities for constrained generation. | | [modules/LlamaSpeculative\|Llama Speculative Decoding] | Draft model interfaces and prompt-based speculative decoding helpers. | +| [modules/Logger\|Logger] | Provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure. It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. | + +--- + +### Development + +This section contains maintainer-facing development notes, workflows, and LLM-assisted helper tools for working on `llama-cpp-python`. + +#### Pages + +| Page | Description | +|---|---| +| [development/Git Commit Generation Agent] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | --- @@ -48,14 +71,16 @@ These pages define how the wiki should be written, updated, and reviewed. If you are new to this wiki, read the pages in this order: -1. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] -2. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] -3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] -4.
[[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] -5. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +1. [[install|Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md)] +2. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +3. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] +4. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] +5. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] +6. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +7. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] +8. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] If you are contributing documentation, start with: - 1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] 2. [[contributing-to-wiki|Contributing to the Wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md)] @@ -67,11 +92,14 @@ The wiki is still being expanded. 
Currently available pages: +- `install.md` - `core/Llama.md` - `modules/LlamaCache.md` - `modules/LlamaEmbedding.md` - `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` +- `modules/Logger.md` +- `development/git-commit-generation-agent.md` - `SCHEMA.md` - `contributing-to-wiki.md` @@ -83,7 +111,6 @@ Some planned pages may already exist as empty placeholder files. Empty pages are Future documentation may cover: -- Installation and build options - Chat formats and chat handlers - Low-level ctypes bindings - Multimodal APIs @@ -110,5 +137,6 @@ This wiki follows a few core rules: ## Project Links - GitHub: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) +- Installation guide: [install](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md) - Wiki schema: [SCHEMA](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md) -- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) \ No newline at end of file +- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) diff --git a/docs/wiki/install.md b/docs/wiki/install.md index e69de29bb2..576ca14c6f 100644 --- a/docs/wiki/install.md +++ b/docs/wiki/install.md @@ -0,0 +1,775 @@ +--- +title: Installation +page_type: guide +source_files: + - README.md + - vendor/llama.cpp/docs/build.md + - vendor/llama.cpp/docs/backend/ +last_updated: 2026-06-02 +author: JamePeng +version_target: "latest" +--- + +# Installation + +## Overview + +This page explains how to install `llama-cpp-python` from source, with or +without hardware acceleration. + +`llama-cpp-python` builds the native `llama.cpp` libraries during installation +and installs them inside the Python package. The exact build depends on your +Python version, compiler, CMake version, operating system, and selected +`llama.cpp` backend. 
+ +For most users, the safest installation path is: + +1. Create a clean Python virtual environment. +2. Upgrade `pip`. +3. Install from the GitHub repository. +4. Pass `CMAKE_ARGS` only when you need a specific backend. + +--- + +## Requirements + +| Requirement | Notes | +|---|---| +| Python | Python 3.9 or newer. The package metadata currently lists Python 3.9 through 3.14. | +| CMake | CMake 3.21 or newer. | +| C/C++ compiler | Required because the package builds `llama.cpp` native libraries. | +| Git | Required when installing from the GitHub repository or cloning recursively. | +| Backend SDKs | Required only for GPU or accelerator builds, such as CUDA, Vulkan, OpenVINO, ROCm/HIP, or SYCL. | + +Platform compiler guidance: + +| Platform | Typical compiler setup | +|---|---| +| Linux | `gcc` or `clang` plus Python development headers if required by your distribution. | +| Windows | Visual Studio 2022 Build Tools or MinGW. For most native builds, Visual Studio Build Tools is recommended. | +| macOS | Xcode Command Line Tools. Metal is enabled by default on supported macOS builds. | + +--- + +## Use a Virtual Environment + +Using a virtual environment avoids mixing build artifacts and dependencies from +different Python installations. + +### Linux and macOS + +```bash +python3 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip setuptools wheel +``` + +### Windows PowerShell + +```powershell +py -3 -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip setuptools wheel +``` + +If PowerShell blocks activation scripts, run: + +```powershell +Set-ExecutionPolicy -Scope CurrentUser RemoteSigned +``` + +Then activate the environment again. 
+ +--- + +## Basic Installation + +Install directly from the project repository: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Windows PowerShell: + +```powershell +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This builds `llama.cpp` from source and installs the generated native runtime +libraries alongside the Python package. + +Use verbose output when diagnosing build failures: + +```bash +python -m pip install --verbose "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Install From a Local Clone + +Clone recursively so the `vendor/llama.cpp` submodule is available: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install . +``` + +If you already cloned without `--recursive`, initialize the submodule manually: + +```bash +git submodule update --init --recursive +``` + +For editable development installs: + +```bash +python -m pip install -e . +``` + +--- + +## Passing CMake Options + +`llama.cpp` backend options are passed through CMake. There are two common +ways to pass those options during `pip install`. 
+ +### Environment Variable + +Linux and macOS: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Clear the variable after the build if you do not want it reused: + +```powershell +Remove-Item Env:CMAKE_ARGS +``` + +### `pip --config-settings` + +You can also pass CMake arguments through `pip`: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" \ + -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" +``` + +Use semicolons inside `cmake.args` when passing multiple CMake definitions. + +--- + +## Common CMake Options + +The Python package forwards CMake options to the bundled `vendor/llama.cpp` +build. These options are useful across many backends. + +| Option | Typical values | Use | +|---|---|---| +| `CMAKE_BUILD_TYPE` | `Release`, `Debug` | Selects build type for single-config generators such as Ninja or Unix Makefiles. Release is the normal install choice. | +| `GGML_NATIVE` | `ON`, `OFF` | Controls whether ggml builds for the current host CPU/GPU. Use `OFF` for more portable wheels; use `ON` for local machine-specific optimization. | +| `BUILD_SHARED_LIBS` | `ON`, `OFF` | Controls shared versus static native libraries. The Python package normally installs shared runtime libraries. | +| `GGML_BACKEND_DL` | `ON`, `OFF` | Builds backend libraries so they can be loaded dynamically at runtime when supported by the build. | +| `GGML_CPU_ALL_VARIANTS` | `ON`, `OFF` | Builds multiple CPU backend variants for x86 feature sets when supported. Useful for portable x64 wheels. | +| `GGML_OPENMP` | `ON`, `OFF` | Enables OpenMP CPU parallelism. 
On Windows, OpenMP runtime DLLs may need to be packaged beside backend DLLs. | +| `CMAKE_PREFIX_PATH` | path list | Helps CMake find SDKs or libraries installed outside default locations. | +| `CMAKE_C_COMPILER` / `CMAKE_CXX_COMPILER` | compiler paths or names | Selects compilers, often needed for SYCL, HIP, or custom toolchains. | + +Example portable CUDA build: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Example dynamic CPU backend build: + +```bash +CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Backend Quick Reference + +Choose one backend path that matches your hardware and installed SDKs. + +| Backend | Typical CMake option | Notes | +|---|---|---| +| CPU only | none | Default portable path. Performance depends on CPU features and build options. | +| OpenBLAS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS` | CPU BLAS acceleration for prompt processing and larger batches. | +| BLIS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME` | CPU BLAS route using BLIS. | +| Intel oneMKL | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp` | Intel CPU BLAS route. This is not the Intel GPU path. | +| CUDA | `-DGGML_CUDA=on` | Requires NVIDIA CUDA Toolkit matching your driver and GPU. | +| Metal | `-DGGML_METAL=on` | Enabled by default on supported macOS builds. Use `-DGGML_METAL=OFF` to disable. | +| Vulkan | `-DGGML_VULKAN=on` | Requires Vulkan SDK and platform-specific setup. | +| OpenVINO | `-DGGML_OPENVINO=ON` | Useful for Intel CPU, GPU, and NPU workflows after OpenVINO environment setup. | +| HIP / ROCm | `-DGGML_HIP=ON` | For supported AMD GPUs. May require `GPU_TARGETS`. 
| +| SYCL | `-DGGML_SYCL=on` | Usually used with Intel oneAPI compilers. | +| OpenCL | `-DGGML_OPENCL=ON` | Primarily documented for Qualcomm Adreno and Snapdragon workflows; can also apply to some other OpenCL devices. | +| CANN | `-DGGML_CANN=ON` | Ascend NPU backend. Requires Ascend drivers and CANN toolkit. | +| ZenDNN | `-DGGML_ZENDNN=ON` | AMD Zen CPU acceleration, mainly matrix multiplication paths. | +| zDNN | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | IBM Z / LinuxONE acceleration path. | + +For the full list of backend options, check the upstream llama.cpp build +documentation and the current `vendor/llama.cpp` source. + +--- + +## CUDA + +CUDA builds require the NVIDIA CUDA Toolkit. Choose a toolkit version that is +compatible with your driver and GPU. + +Linux: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +For newer NVIDIA GPUs with compute capability 9.0 (`sm_90`) or higher, the README notes +that Programmatic Dependent Launch can be enabled with: + +```bash +-DGGML_CUDA_PDL=ON +``` + +Example: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on -DGGML_CUDA_PDL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` produces large volumes of non-blocking template warnings, the README +documents optional CUDA warning suppression: + +```bash +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" +``` + +### CUDA Portability and Architecture Selection + +By default, llama.cpp may build for the GPU detected on the build machine. 
For +a wheel intended to run across multiple CUDA GPUs, disable native detection: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` cannot detect your GPU, or if you want to control the generated +binary size, specify CUDA architectures explicitly: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86;89" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Use NVIDIA's compute capability table to choose architecture numbers. For +example, RTX 30-series GPUs commonly use `86`, and RTX 4090 uses `89`. + +If multiple CUDA toolkits are installed, point CMake at the intended compiler: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-12.8/bin/nvcc" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter after installation: + +| Variable | Use | +|---|---| +| `CUDA_VISIBLE_DEVICES` | Selects or hides CUDA devices for the current process. | +| `GGML_CUDA_ENABLE_UNIFIED_MEMORY` | Enables unified-memory fallback on Linux when VRAM is exhausted. On Windows, similar behavior may be controlled by NVIDIA driver settings. | +| `GGML_CUDA_P2P` | Enables peer-to-peer access between GPUs when driver and hardware support it. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` | Forces FP32 compute in selected cuBLAS paths, trading speed for numerical headroom. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` | Forces FP16 compute in selected cuBLAS paths when supported. | + +--- + +## BLAS and CPU Acceleration + +BLAS acceleration mainly improves prompt processing and larger batch prefill. +It generally does not improve single-token generation speed as much as GPU +offload. + +### OpenBLAS + +Use OpenBLAS when the OpenBLAS development package is available on your system. 
+ +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux, install the OpenBLAS development package with your system package +manager before building. Package names vary by distribution. + +### BLIS + +BLIS is selected through the `FLAME` BLAS vendor after BLIS is installed: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream BLIS guide also notes that runtime variables such as +`BLIS_NUM_THREADS` and OpenMP affinity settings can affect CPU performance. + +### Intel oneMKL for CPU + +Intel oneMKL is a CPU BLAS path. It is different from Intel GPU acceleration, +which is usually handled through SYCL or OpenVINO. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Metal on macOS + +On macOS, Metal is enabled by default by this project when building on Apple +platforms. A normal install is usually enough: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To disable Metal at build time: + +```bash +CMAKE_ARGS="-DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +At runtime, use `n_gpu_layers=0` when you want CPU inference even though the +package was built with Metal support. + +--- + +## Vulkan + +Vulkan builds require the Vulkan SDK and any platform-specific environment +setup required by the SDK. 
+ +```bash +CMAKE_ARGS="-DGGML_VULKAN=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux and macOS, make sure the Vulkan SDK setup script has been sourced in +the same shell session before running `pip install`. + +On Windows, install the Vulkan SDK and make sure its environment variables are +available in the shell that runs the build. + +On Linux, system packages can also provide the Vulkan loader and shader tools. +The upstream guide notes that SPIR-V headers may be required separately from +the Vulkan loader development package on some distributions. + +For macOS Vulkan builds, Vulkan usually runs through a Metal translation layer. +The upstream guide builds Vulkan with Metal disabled: + +```bash +CMAKE_ARGS="-DGGML_VULKAN=ON -DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## OpenVINO + +OpenVINO builds require the OpenVINO runtime and environment setup first. + +Linux: + +```bash +source /opt/intel/openvino/setupvars.sh +CMAKE_ARGS="-DGGML_OPENVINO=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows: + +```powershell +# Run this from a shell where OpenVINO setupvars.bat has been initialized, +# such as an OpenVINO command prompt, or initialize it through cmd first. +$env:CMAKE_ARGS = "-DGGML_OPENVINO=ON" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The OpenVINO backend is intended for Intel CPU, GPU, and NPU workflows when the +OpenVINO runtime supports the target device. + +Runtime variables: + +| Variable | Use | +|---|---| +| `GGML_OPENVINO_DEVICE` | Selects `CPU`, `GPU`, `NPU`, or a specific GPU such as `GPU.0`. Defaults to CPU if unset or unavailable. | +| `GGML_OPENVINO_CACHE_DIR` | Enables OpenVINO model caching when set. 
Not supported on NPU devices according to upstream docs. | +| `GGML_OPENVINO_STATEFUL_EXECUTION` | Enables stateful KV-cache execution. Upstream docs recommend it for CPU/GPU performance and note it is not effective on NPU. | +| `GGML_OPENVINO_PREFILL_CHUNK_SIZE` | Controls NPU prefill chunk size. | +| `GGML_OPENVINO_PROFILING` | Enables OpenVINO profiling. | + +Important limitations from the upstream OpenVINO backend docs: + +- GPU stateless execution has known issues; use `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU workflows. +- NPU runs may fail when context size is too large. Keep context size small for NPU workflows. +- Encoder models such as embedding and reranking models are not supported by the current OpenVINO backend implementation. +- Some benchmark workflows require Flash Attention enabled in the llama.cpp tool layer; in Python, verify behavior against your target model and backend. + +--- + +## HIP / ROCm + +HIP builds are for supported AMD GPUs. + +Linux example: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +`GPU_TARGETS` is optional in some setups, but specifying your GPU architecture +can reduce build time and avoid unsupported target issues. + +Windows ROCm builds are more environment-sensitive. The README currently +documents a TheRock ROCm workflow that sets `HIP_PATH`, `ROCM_PATH`, +`HIP_DEVICE_LIB_PATH`, compiler paths, `CMAKE_GENERATOR`, and `CMAKE_ARGS` +before running `pip install`. + +For RDNA3 or CDNA hardware, upstream docs mention optional Flash Attention +acceleration through rocWMMA: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1100 -DGGML_HIP_ROCWMMA_FATTN=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter: + +| Variable | Use | +|---|---| +| `HIP_VISIBLE_DEVICES` | Selects visible HIP devices. 
| +| `HSA_OVERRIDE_GFX_VERSION` | Can help unsupported Linux GPUs use a nearby architecture value. Upstream docs note this is not supported on Windows. | +| `HIP_DEVICE_LIB_PATH` | Points to ROCm device bitcode libraries when clang cannot find them. | + +--- + +## SYCL + +SYCL builds are usually used with Intel oneAPI compilers. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To request FP16 support: + +```bash +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful SYCL build options from the upstream backend docs: + +| Option | Use | +|---|---| +| `GGML_SYCL_F16` | Enables FP16 build path. Test both FP32 and FP16 for your model and device. | +| `GGML_SYCL_TARGET` | Selects SYCL target type. Intel is the default target in upstream docs. | +| `GGML_SYCL_DEVICE_ARCH` | Selects device architecture when known. | +| `GGML_SYCL_GRAPH` | Enables the experimental SYCL graph extension. | +| `GGML_SYCL_DNN` | Enables oneDNN integration. | +| `GGML_SYCL_HOST_MEM_FALLBACK` | Allows host-memory fallback when device memory is full, at reduced speed. | +| `GGML_SYCL_SUPPORT_LEVEL_ZERO` | Enables Level Zero support for Intel GPU memory allocation. | + +Useful SYCL runtime variables: + +| Variable | Use | +|---|---| +| `ONEAPI_DEVICE_SELECTOR` | Selects a SYCL device, such as a specific Level Zero GPU. | +| `GGML_SYCL_ENABLE_FLASH_ATTN` | Enables or disables Flash Attention in the SYCL backend. | +| `GGML_SYCL_ENABLE_LEVEL_ZERO` | Uses Level Zero allocation when support was built in. | +| `GGML_SYCL_DISABLE_DNN` | Disables oneDNN path and uses oneMKL path. | +| `ZES_ENABLE_SYSMAN` | Helps query free GPU memory in some Intel GPU setups. 
| + +--- + +## OpenCL + +OpenCL support is documented upstream mainly for Qualcomm Adreno GPUs and +Snapdragon devices. It may also work on certain other OpenCL-capable GPUs, but +SYCL is usually preferred for modern Intel GPU workflows. + +```bash +CMAKE_ARGS="-DGGML_OPENCL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful OpenCL CMake options: + +| Option | Default | Use | +|---|---|---| +| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embeds OpenCL kernels into the built binary or library. | +| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Enables kernels optimized for Adreno. | + +For Linux builds where OpenCL headers and ICD loader are installed in a custom +prefix, pass that location through `CMAKE_PREFIX_PATH`. + +--- + +## CANN + +CANN is the Ascend NPU backend. It requires Ascend drivers and the CANN toolkit +before building. + +```bash +CMAKE_ARGS="-DGGML_CANN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream CANN documentation focuses on Linux and Ascend devices such as +Atlas 300I A2 and Atlas 300I Duo. Supported model families and data types vary +by device generation. + +--- + +## ZenDNN and zDNN + +ZenDNN and zDNN are different backends. 
+ +| Backend | Hardware | CMake option | +|---|---|---| +| ZenDNN | AMD Zen CPUs, especially AMD EPYC | `-DGGML_ZENDNN=ON` | +| zDNN | IBM Z / LinuxONE with NNPA acceleration | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | + +ZenDNN can be downloaded and built automatically by CMake: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If you already have a ZenDNN installation: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +zDNN requires a zDNN library installation first: + +```bash +CMAKE_ARGS="-DGGML_ZDNN=ON -DZDNN_ROOT=/opt/zdnn-libs" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +ZenDNN currently accelerates matrix multiplication paths and may fall back to +the standard CPU backend for other operations. + +--- + +## Dynamic Backend Wheels + +The README notes that newer preview wheels may be built with: + +```text +GGML_BACKEND_DL=ON +GGML_CPU_ALL_VARIANTS=ON +``` + +In that build mode, CPU backend variants are installed as separate runtime +libraries under: + +```text +site-packages/llama_cpp/lib +``` + +Examples include: + +```text +ggml-cpu-x64 +ggml-cpu-sse42 +ggml-cpu-haswell +ggml-cpu-skylakex +ggml-cpu-alderlake +ggml-cpu-zen4 +``` + +On Windows, dynamic CPU backend DLLs may also need the LLVM OpenMP runtime +next to them: + +```text +libomp140.x86_64.dll +``` + +Based on the current top-level `CMakeLists.txt`, this project installs many +`llama`, `ggml`, CPU-variant, accelerator backend, and `mtmd` targets into the +Python package runtime directory when those targets are available. 
+ +--- + +## Upgrading and Rebuilding + +Use `--upgrade`, `--force-reinstall`, and `--no-cache-dir` when you need to +force a rebuild with new CMake options: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install --upgrade --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This is important because `pip` may otherwise reuse cached wheels or build +artifacts from a previous backend configuration. + +For local editable builds, clean old native artifacts before rebuilding when +switching backends: + +```bash +make clean +python -m pip install --verbose -e . +``` + +On Windows, if `make` is not available, remove `_skbuild` and old native +libraries under `llama_cpp/lib` manually before reinstalling. + +--- + +## Verify Installation + +Check that the package imports: + +```bash +python -c "import llama_cpp; print(llama_cpp.__version__)" +``` + +Check where the package was installed: + +```bash +python -c "import llama_cpp, pathlib; print(pathlib.Path(llama_cpp.__file__).parent)" +``` + +Check the bundled native runtime libraries: + +```bash +python -c "import llama_cpp, pathlib; print(list((pathlib.Path(llama_cpp.__file__).parent / 'lib').glob('*')))" +``` + +Run a minimal model load after downloading a GGUF model: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=0, + verbose=False, +) + +output = llm("Hello,", max_tokens=8) +print(output["choices"][0]["text"]) +``` + +For GPU builds, set `n_gpu_layers=-1` to offload all layers, or a positive +value to offload that many layers: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=-1, +) +``` + +--- + +## Development Workflow + +Common local development commands: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install -e . 
+python -m pytest +``` + +The repository also includes a `Makefile` with useful targets: + +| Target | Purpose | +|---|---| +| `make build` | Editable build with verbose output. | +| `make build.cuda` | Editable build with `GGML_CUDA=on`. | +| `make build.openblas` | Editable build with OpenBLAS. | +| `make build.openvino` | Editable build with OpenVINO. | +| `make build.vulkan` | Editable build with Vulkan. | +| `make build.sycl` | Editable build with SYCL. | +| `make test` | Run pytest with verbose tracing. | +| `make clean` | Remove local native build artifacts. | + +When testing a different `llama.cpp` commit, update the `vendor/llama.cpp` +submodule, clean the local build, and reinstall. If the upstream C API changes, +the ctypes declarations in `llama_cpp/llama_cpp.py` may also need to be updated. + +--- + +## Common Installation Pitfalls + +| Symptom | Likely cause | What to try | +|---|---|---| +| CMake cannot find a compiler | Build tools are missing or not available in the current shell. | Install platform build tools and reopen the terminal. On Windows, use a Developer PowerShell or initialize Visual Studio build variables. | +| Build ignores new backend flags | `pip` reused a cached wheel or previous build. | Reinstall with `--force-reinstall --no-cache-dir`, and clean `_skbuild` for local builds. | +| CUDA backend does not build | CUDA Toolkit is missing, incompatible, or not on `PATH`. | Verify `nvcc --version`, CUDA driver compatibility, and `CUDA_PATH` on Windows. | +| CUDA build targets the wrong GPU generation | Native architecture detection picked the build machine GPU, or `nvcc` could not detect it. | Use `-DGGML_NATIVE=OFF` for portability or set `-DCMAKE_CUDA_ARCHITECTURES=...` explicitly. | +| Native library fails to load on Windows | Required DLLs are missing from `PATH` or `llama_cpp/lib`. | Check `llama_cpp/lib` for `llama.dll`, `ggml*.dll`, backend DLLs, and runtime DLLs such as OpenMP or CUDA dependencies. 
| +| GPU is not used at runtime | The package was built without that backend or `n_gpu_layers` is `0`. | Rebuild with the correct CMake backend flag and set `n_gpu_layers` to a positive value or `-1`. | +| OpenVINO GPU or NPU behaves unexpectedly | Runtime device selection or context size is unsuitable. | Set `GGML_OPENVINO_DEVICE`, enable `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU, and keep context size smaller for NPU workflows. | +| SYCL device is not selected | oneAPI environment or device selector is missing. | Source oneAPI setup and set `ONEAPI_DEVICE_SELECTOR` for the intended device. | +| Submodule files are missing | Repository was cloned without `--recursive`. | Run `git submodule update --init --recursive`. | + +For detailed diagnostics, see [[Troubleshooting]]. + +--- + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [README Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/README.md#installation) +* [llama.cpp build documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) +* [llama.cpp backend documentation](https://github.com/ggml-org/llama.cpp/tree/master/docs/backend) diff --git a/docs/wiki/modules/LlamaCache.md b/docs/wiki/modules/LlamaCache.md index 64e6bbb5f8..d1db0a2097 100644 --- a/docs/wiki/modules/LlamaCache.md +++ b/docs/wiki/modules/LlamaCache.md @@ -2,7 +2,7 @@ title: Llama Cache module_name: llama_cpp.llama_cache source_file: llama_cpp/llama_cache.py -last_updated: 2026-05-02 +last_updated: 2026-05-06 version_target: "latest" --- @@ -21,10 +21,10 @@ It defines several cache classes: | `BaseLlamaCache` | Abstract base class for llama.cpp state caches. | | `LlamaRAMCache` | In-memory LRU cache for `LlamaState` objects. | | `LlamaDiskCache` | Disk-backed cache using the `diskcache` library. 
| -| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | -| `HybridCheckpointCache` | Checkpoint manager for RNN/Hybrid model hidden states. | -| `HybridCheckpoint` | Dataclass representing one saved hybrid model checkpoint. | | `TrieNode` | Internal trie node used by `LlamaTrieCache`. | +| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | +| `HybridCheckpoint` | Dataclass representing one saved Hybrid/Recurrent checkpoint and its host-visible payload. | +| `HybridCheckpointCache` | Checkpoint manager for Hybrid/Recurrent model state snapshots, with host and device-backed modes. | The public compatibility alias is: @@ -910,7 +910,7 @@ from llama_cpp.llama_cache import LlamaTrieCache as LlamaCache ## Overview -`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or recurrent model's hidden state. +`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or Recurrent model state. It is used by `HybridCheckpointCache`. @@ -920,9 +920,14 @@ Defined in: `llama_cpp/llama_cache.py` ## Role in the API -Hybrid or recurrent models may require hidden-state rollback rather than standard KV-cache truncation. +Hybrid or recurrent models may require sequence-state rollback rather than standard KV-cache truncation. + +`HybridCheckpoint` stores the checkpoint position, prefix verification hash, sequence id, and the serialized checkpoint payload visible to Python. -`HybridCheckpoint` stores enough metadata to verify and restore a specific recurrent state snapshot. +Its `data` field has different ownership semantics depending on the cache mode: + +* In host mode (`on_device=False`), `data` contains the full host-side serialized checkpoint state. +* In device mode (`on_device=True`), `data` contains only the host-visible serialized portion. The large tensor payload is stored in `llama_context`-owned device buffers by llama.cpp, keyed by `seq_id`. 
--- @@ -936,19 +941,19 @@ class HybridCheckpoint: hash_val: str size: int seq_id: int -``` +```` --- ## Fields -| Field | Type | Description | -| ---------- | ------- | --------------------------------------------------------------- | -| `pos` | `int` | Token position where this checkpoint was taken. | -| `data` | `bytes` | Raw binary RNN or Hybrid model state data. | -| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | -| `size` | `int` | Size of the state data in bytes. | -| `seq_id` | `int` | Sequence ID associated with this checkpoint. | +| Field | Type | Description | +| ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `pos` | `int` | Token position where this checkpoint was taken. | +| `data` | `bytes` | Serialized checkpoint payload visible to Python. In host mode this is the full state; in device mode this is only the host-visible portion. | +| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | +| `size` | `int` | Number of bytes written by `llama_state_seq_get_data_ext`. | +| `seq_id` | `int` | Sequence id used by llama.cpp sequence-state APIs. | --- @@ -958,23 +963,33 @@ class HybridCheckpoint: Users usually do not need to instantiate this dataclass manually. +In device mode, old `HybridCheckpoint` Python objects may become stale if a newer checkpoint is saved for the same `seq_id`, because the device-side tensor payload is keyed by `seq_id` and may be overwritten. + --- # `HybridCheckpointCache` ## Overview -`HybridCheckpointCache` manages RNN or Hybrid model hidden-state checkpoints. +`HybridCheckpointCache` manages Hybrid/Recurrent model state checkpoints. + +It is designed for models whose memory cannot always be safely truncated like a regular Transformer KV cache. 
Instead, rollback is implemented by saving and restoring sequence-state snapshots through llama.cpp state APIs. + +The cache supports two operating modes: -It is designed for models that cannot physically truncate KV cache in the same way as standard transformer-only models. +1. **Host mode** (`on_device=False`) -Instead of implementing dictionary-style cache operations, it provides explicit checkpoint operations: + * Full checkpoint payloads are materialized as Python-owned `bytes`. + * Multiple historical checkpoints per `seq_id` are safe. + * This is the default mode and is useful for multi-turn rollback or deeper prefix reuse. -* `save_checkpoint` -* `find_best_checkpoint` -* `restore_checkpoint` -* `clear` -* `close` +2. **Device mode** (`on_device=True`) + + * `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp. + * Tensor payloads are stored in `llama_context`-owned device buffers. + * Python keeps only the host-visible serialized portion. + * Only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. + * This mode can reduce device-to-host copy overhead during checkpoint save/restore. Defined in: `llama_cpp/llama_cache.py` @@ -984,12 +999,14 @@ Defined in: `llama_cpp/llama_cache.py` `HybridCheckpointCache` is a specialized cache manager for Hybrid/Recurrent model rollback. -It stores raw state snapshots extracted from the llama.cpp backend through low-level C API functions: +It stores host-visible checkpoint data extracted from the llama.cpp backend through low-level C API functions: * `llama_state_seq_get_size_ext` * `llama_state_seq_get_data_ext` * `llama_state_seq_set_data_ext` +When `on_device=True`, tensor payloads are not treated as Python-owned bytes. They are stored by llama.cpp in `llama_context`-owned device buffers, while Python keeps the host-visible serialized portion and checkpoint metadata. + It is not a drop-in replacement for `LlamaRAMCache`, `LlamaDiskCache`, or `LlamaTrieCache`. 
--- @@ -1001,16 +1018,18 @@ def __init__( self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, + on_device: bool = False, verbose: bool = False ): ... ``` -| Parameter | Type | Default | Required | Description | -| ----------------- | ------------------------------- | ------: | -------: | ------------------------------------------------------------------------------------------- | -| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Low-level llama.cpp context pointer. Required for extracting and restoring sequence state. | -| `max_checkpoints` | `int` | `16` | No | Maximum number of checkpoints to retain. If set to `0` or below, checkpointing is disabled. | -| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. | +| Parameter | Type | Default | Required | Description | +| ----------------- | ------------------------------- | ------: | -------: | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Borrowed low-level llama.cpp context pointer used for sequence-state save/restore. The cache does not own or free this context. | +| `max_checkpoints` | `int` | `16` | No | Maximum number of Python-side checkpoint entries to retain. If set to `0` or below, checkpointing is disabled. | +| `on_device` | `bool` | `False` | No | Whether to request llama.cpp to store checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | +| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. | --- @@ -1018,32 +1037,26 @@ def __init__( The constructor raises `ValueError` if `ctx` is `None`. -```python -if ctx is None: - raise ValueError( - "HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context" - ) -``` +If `max_checkpoints <= 0`, checkpointing is disabled. 
In verbose mode, the cache reports that rollback capabilities are turned off. This mode is intended to avoid expensive state extraction for single-turn workflows. -If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. - -This mode is intended to avoid expensive state extraction for single-turn workflows. +When `on_device=True`, the cache forwards `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to llama.cpp. In this mode, the cache keeps only one active checkpoint per `seq_id` by replacing old Python-side checkpoint metadata before saving a new checkpoint for the same `seq_id`. --- ## Instance Variables -| Name | Type | Description | -| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------ | -| `_ctx` | `llama_cpp_lib.llama_context_p` | Low-level llama.cpp context pointer used for state extraction and restoration. | -| `max_checkpoints` | `int` | Maximum number of checkpoints retained. Values less than or equal to zero disable checkpointing. | -| `checkpoints` | `list[HybridCheckpoint]` | Stored checkpoint objects. | -| `_current_size` | `int` | Total memory used by all stored checkpoints in bytes. | -| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | -| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | -| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | -| `_flag_partial` | int | Cached value of `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY`. | -| `verbose` | `bool` | Enables debug output. 
| +| Name | Type | Description | +| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `_ctx` | `llama_cpp_lib.llama_context_p` | Borrowed llama.cpp context pointer used for state extraction and restoration. | +| `on_device` | `bool` | Whether `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp state APIs. | +| `verbose` | `bool` | Enables debug output. | +| `max_checkpoints` | `int` | Maximum number of Python-side checkpoint entries retained. Values less than or equal to zero disable checkpointing. | +| `checkpoints` | `list[HybridCheckpoint]` | Python-side checkpoint registry. In host mode, entries own full checkpoint payloads. In device mode, entries own only host-visible metadata/payload portions. | +| `_current_size` | `int` | Python-tracked host-visible checkpoint size in bytes. In device mode, this does not include `llama_context`-owned device tensor storage. | +| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | +| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | +| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | +| `_flags` | `int` | Combined llama.cpp sequence-state flags, always including `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY` and optionally `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | --- @@ -1057,7 +1070,11 @@ def cache_size(self) -> int: return self._current_size ``` -Returns the total memory used by stored checkpoints in bytes. +Returns the Python-tracked host-visible checkpoint size in bytes. + +In host mode, this is close to the full serialized checkpoint payload size. + +In device mode, this reports only the host-visible portion returned by llama.cpp. It does not include `llama_context`-owned device tensor storage. --- @@ -1070,14 +1087,16 @@ def clear(self): ... 
``` -Clears all stored checkpoints and resets `_current_size` to `0`. +Clears Python-side checkpoint metadata and resets `_current_size` to `0`. If the checkpoint list is already empty, it returns immediately. +In device mode, this does not explicitly release `llama_context`-owned device buffers. Those buffers are managed by llama.cpp and are associated with the context. + In verbose mode, it prints: ```text -HybridCheckpointCache: cleared +HybridCheckpointCache(clear): cleared ``` --- @@ -1089,15 +1108,15 @@ def close(self): ... ``` -Releases references held by the cache. +Releases Python-side checkpoint metadata and detaches cached references held by the cache. Behavior: -* Sets `checkpoints` to `None`. +* Calls `clear()`. * Sets `_ctx` to `None`. * Sets cached C API function references to `None`. -This method is also called by `__del__`. +This method does not free the llama.cpp context itself, because the context is borrowed rather than owned by the cache. --- @@ -1133,6 +1152,50 @@ This hash is used to ensure checkpoints are restored only when the token prefix --- +### `_replace_checkpoint_for_seq_id` + +```python +def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + ... +``` + +Removes all Python-side checkpoint entries for one `seq_id`. + +This is required in device mode because llama.cpp stores the device tensor payload per `seq_id`, not per Python checkpoint object. Keeping multiple checkpoint metadata entries for the same `seq_id` would be unsafe. + +Behavior: + +1. Iterates over all checkpoint entries. +2. Removes entries whose `seq_id` matches the requested `seq_id`. +3. Preserves entries for other sequence ids. +4. Subtracts removed checkpoint sizes from `_current_size`. +5. Clamps `_current_size` to `0` if needed. + +--- + +### `_evict_checkpoints_if_needed` + +```python +def _evict_checkpoints_if_needed(self) -> None: + ... +``` + +Evicts old checkpoint entries using FIFO order until `len(checkpoints) <= max_checkpoints`. 
+ +In host mode, this evicts full Python-owned checkpoint payloads. + +In device mode, this evicts Python-side checkpoint metadata only. Device tensor payloads are owned by `llama_context`. + +Behavior: + +1. Checks whether the number of checkpoints exceeds `max_checkpoints`. +2. Pops the oldest checkpoint entry from the front of the list. +3. Subtracts its size from `_current_size`. +4. Clamps `_current_size` to `0` if needed. +5. Prints an eviction message in verbose mode. + +--- + ### `find_best_checkpoint` ```python @@ -1144,20 +1207,23 @@ def find_best_checkpoint( ... ``` -Finds the longest valid checkpoint matching the given token prefix and sequence ID. +Finds the longest valid checkpoint matching the given token prefix and sequence id. + +The hash check prevents restoring a checkpoint that has the same length but belongs to a different prompt/history. Returns `None` if: * Checkpointing is disabled. * There are no checkpoints. -* No checkpoint matches the requested sequence ID and token prefix. +* No checkpoint matches the requested sequence id and token prefix. Behavior: -1. Skips checkpoints whose `seq_id` differs. -2. Skips checkpoints whose `pos` is greater than the current token length. -3. Verifies token-prefix integrity using `_hash_prefix`. -4. Returns the checkpoint with the largest matching `pos`. +1. Returns immediately if `max_checkpoints <= 0` or no checkpoints exist. +2. Skips checkpoints whose `seq_id` differs from the requested `seq_id`. +3. Skips checkpoints whose `pos` is greater than the current token length. +4. Verifies token-prefix integrity using `_hash_prefix`. +5. Returns the checkpoint with the largest matching `pos`. --- @@ -1173,7 +1239,7 @@ def save_checkpoint( ... ``` -Extracts the current recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. +Extracts the current Hybrid/Recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. 
Returns `True` if the checkpoint was saved successfully. @@ -1186,20 +1252,24 @@ Returns `False` if: ### Behavior 1. Returns immediately if `max_checkpoints <= 0`. -2. Calls `_get_size_ext` to query the required state buffer size. -3. Allocates a `ctypes.c_uint8` buffer. -4. Calls `_get_data_ext` to extract state data. -5. Copies the state bytes into a Python `bytes` object. -6. Computes a hash of the token prefix. -7. Appends a new `HybridCheckpoint`. -8. Increments `_current_size`. -9. Evicts old checkpoints using FIFO order if the number of checkpoints exceeds `max_checkpoints`. +2. In device mode, removes old Python-side checkpoint metadata for the same `seq_id`. +3. Uses `_flags` to select partial-only state serialization, optionally with `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. +4. Calls `_get_size_ext` to query the required host-visible buffer size. +5. Allocates a `ctypes.c_uint8` buffer. +6. Calls `_get_data_ext` to extract the host-visible checkpoint data. +7. Copies the data into a Python `bytes` object. +8. Computes a hash of the token prefix. +9. Appends a new `HybridCheckpoint`. +10. Increments `_current_size`. +11. Evicts old checkpoint entries using FIFO order if the number of entries exceeds `max_checkpoints`. ### Important Performance Note The implementation intentionally bypasses checkpoint extraction when `max_checkpoints <= 0`. -This avoids potentially large synchronous VRAM-to-RAM transfers for single-turn workflows. +This avoids potentially large synchronous checkpoint extraction costs for single-turn workflows. + +When `on_device=True`, llama.cpp may keep large tensor payloads in context-owned device buffers instead of materializing them as Python-owned bytes. This can reduce device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe. --- @@ -1220,18 +1290,28 @@ Returns `True` if restoration succeeds. Returns `False` if: -* The checkpoint sequence ID does not match the requested `seq_id`. 
+* The checkpoint sequence id does not match the requested `seq_id`. +* `on_device=True` and the checkpoint object is no longer tracked by this cache. * The current backend state size differs from the checkpoint size. * The backend does not report the expected number of restored bytes. ### Behavior 1. Verifies `cp.seq_id == seq_id`. -2. Queries current expected state size from the backend. -3. Verifies it matches `cp.size`. -4. Copies checkpoint bytes into a ctypes buffer. -5. Calls `_set_data_ext` to restore the state. -6. Returns whether the number of restored bytes equals `cp.size`. +2. In device mode, rejects stale checkpoint objects that are no longer tracked by this cache. +3. Queries current expected host-visible state size from the backend. +4. Verifies it matches `cp.size`. +5. Copies checkpoint bytes into a ctypes buffer. +6. Calls `_set_data_ext` to restore the state. +7. Returns whether the number of restored bytes equals `cp.size`. + +### Stale Checkpoint Guard + +In device mode, Python does not own the full checkpoint tensor payload. The large tensor payload is stored inside `llama_context` device buffers keyed by `seq_id`. + +If a newer checkpoint is saved for the same `seq_id`, an older `HybridCheckpoint` Python object may still exist outside the cache, but its device-side tensor payload may have been overwritten. + +For this reason, `restore_checkpoint` refuses on-device checkpoint objects that are no longer tracked by the cache. This avoids restoring old Python metadata together with newer device tensors. --- @@ -1270,7 +1350,7 @@ Users should use checkpoint-specific methods instead. 
--- -## Example +## Example: Host-backed Checkpoints ```python from llama_cpp.llama_cache import HybridCheckpointCache @@ -1279,6 +1359,7 @@ from llama_cpp.llama_cache import HybridCheckpointCache checkpoint_cache = HybridCheckpointCache( ctx=ctx, max_checkpoints=16, + on_device=False, verbose=True, ) @@ -1299,16 +1380,57 @@ if saved: print("Restored:", restored) ``` -> Note: This example assumes `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. +Host mode stores full serialized checkpoint payloads in Python-owned `bytes`. Multiple historical checkpoints per `seq_id` are safe. + +--- + +## Example: Device-backed Checkpoints + +```python +from llama_cpp.llama_cache import HybridCheckpointCache + +# `ctx` must be a valid llama.cpp context pointer. +checkpoint_cache = HybridCheckpointCache( + ctx=ctx, + max_checkpoints=16, + on_device=True, + verbose=True, +) + +tokens = [1, 2, 3, 4] +current_pos = len(tokens) + +saved = checkpoint_cache.save_checkpoint( + current_pos=current_pos, + tokens=tokens, + seq_id=0, +) + +if saved: + checkpoint = checkpoint_cache.find_best_checkpoint(tokens, seq_id=0) + + if checkpoint is not None: + restored = checkpoint_cache.restore_checkpoint(checkpoint, seq_id=0) + print("Restored:", restored) +``` + +In device mode, llama.cpp owns the large tensor payload in context-owned device buffers. Python keeps only the host-visible checkpoint data and metadata. + +Only one active checkpoint per `seq_id` is safe. + +> Note: These examples assume `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. Instead, they configure it through the `Llama` constructor using `ctx_checkpoints`, `checkpoint_interval`, and `checkpoint_on_device`. --- ## Best Practices * Use `HybridCheckpointCache` only for Hybrid or recurrent model workflows that require hidden-state rollback. 
+* Keep `on_device=False` when you need multiple historical checkpoints for the same `seq_id`. +* Use `on_device=True` when reducing device-to-host checkpoint copy overhead is more important than keeping many historical checkpoint payloads. In this mode, Python keeps only the host-visible checkpoint data and metadata such as `seq_id` and `pos`. * Set `max_checkpoints=0` for single-turn workflows where rollback is not needed. * Keep `max_checkpoints` small if checkpoint states are large. * Use `find_best_checkpoint` before calling `restore_checkpoint`. +* Do not hold and restore old on-device `HybridCheckpoint` objects after newer checkpoints have been saved for the same `seq_id`. * Do not use dictionary-style cache access with this class. --- @@ -1319,7 +1441,10 @@ if saved: * `max_checkpoints <= 0` disables checkpointing. * Restoring a checkpoint with the wrong `seq_id` fails. * Restore fails if the current backend state size no longer matches the checkpoint size. -* `close()` sets internal references to `None`; the object should not be reused afterward. +* In device mode, old `HybridCheckpoint` objects can become stale after a newer checkpoint is saved for the same `seq_id`. +* In device mode, `cache_size` does not include `llama_context`-owned device tensor storage. +* `clear()` removes Python-side checkpoint metadata but does not explicitly free llama.cpp-owned device buffers. +* `close()` detaches internal references; the object should not be reused afterward. * This class is not equivalent to `LlamaCache`. 
--- diff --git a/docs/wiki/modules/LlamaEmbedding.md b/docs/wiki/modules/LlamaEmbedding.md index 1279db5cab..3aa2427227 100644 --- a/docs/wiki/modules/LlamaEmbedding.md +++ b/docs/wiki/modules/LlamaEmbedding.md @@ -3,7 +3,7 @@ title: Llama Embedding module_name: llama_cpp.llama_embedding source_file: llama_cpp/llama_embedding.py class_name: LlamaEmbedding -last_updated: 2026-05-01 +last_updated: 2026-05-31 version_target: "latest" --- @@ -18,7 +18,9 @@ version_target: "latest" | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding |[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding |[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | diff --git a/docs/wiki/modules/LlamaSpeculative.md b/docs/wiki/modules/LlamaSpeculative.md index 0c0ad099fb..9255d01496 100644 --- a/docs/wiki/modules/LlamaSpeculative.md +++ b/docs/wiki/modules/LlamaSpeculative.md @@ -2,7 +2,7 @@ title: Llama Speculative Decoding module_name: llama_cpp.llama_speculative source_file: llama_cpp/llama_speculative.py -last_updated: 2026-05-02 +last_updated: 2026-05-23 version_target: "latest" --- @@ -10,30 +10,37 @@ version_target: "latest" ## Overview -`llama_speculative.py` provides draft model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. 
+`llama_speculative.py` defines draft-model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. -Speculative decoding uses a lightweight draft model to propose candidate tokens before the main model verifies them. In this module, the draft model does not need to be a neural model. It can also be a prompt lookup decoder that predicts future tokens by finding repeated token patterns in the existing context. +Speculative decoding lets a draft model propose candidate tokens before the main `Llama` model verifies them. In this module, the draft model does not have to be a neural network. It can also be a model-free prompt lookup decoder that predicts future tokens from repeated token patterns in the already verified context. This module currently defines: | Class | Status | Description | |---|---|---| -| `LlamaDraftModel` | public interface | Abstract base class for draft models used by speculative decoding. | -| `LlamaNGramMapDecoding` | public | Fast stateful n-gram map based speculative decoder. | +| `LlamaDraftModel` | public interface | Abstract base class for speculative draft models. | +| `LlamaNGramMapDecoding` | public | Stateful model-free n-gram lookup decoder with `k` and `k4v` modes. | | `LlamaPromptLookupDecoding` | legacy public | Stateless NumPy sliding-window prompt lookup decoder. | ## Role in the Library This module defines the draft-model side of speculative decoding. -A draft model receives the current token sequence and returns predicted draft tokens. These draft tokens can then be verified by the main `Llama` model during generation. +A draft model receives the verified token sequence so far and returns predicted draft token IDs. These tokens are later verified by the main `Llama` model during generation. The module provides two prompt-based implementations: -- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based lookup. -- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window implementation. 
+- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based n-gram lookup. +- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window lookup. -For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains an n-gram index instead of scanning the full token history on every call. +For new usage, prefer `LlamaNGramMapDecoding`. It incrementally maintains an n-gram index, supports memory-oriented lookup modes, and avoids scanning the full token history on every call. + +## Choosing Between Related APIs + +| API | Recommended Use | Notes | +|---|---|---| +| `LlamaNGramMapDecoding` | Default prompt lookup decoder for new usage. | Uses stateful n-gram maps and supports `k` / `k4v` modes. | +| `LlamaPromptLookupDecoding` | Compatibility with older prompt lookup behavior. | Stateless and simple, but scans token history with NumPy sliding windows. | ## Classes @@ -41,7 +48,7 @@ For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains ```python class LlamaDraftModel(abc.ABC) -```` +``` Abstract base class for speculative draft models. @@ -58,15 +65,15 @@ def __call__( ) -> npt.NDArray[np.intc] ``` -| Parameter | Type | Description | -| ----------- | ---------------------- | ----------------------------------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Current token sequence. | -| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | Returns: -| Type | Description | -| ---------------------- | -------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Draft token IDs proposed by the draft model. 
| ## `LlamaNGramMapDecoding` @@ -75,9 +82,11 @@ Returns: class LlamaNGramMapDecoding(LlamaDraftModel) ``` -Fast speculative decoder based on an n-gram hash map. +Fast model-free speculative decoder based on prompt n-gram lookup. + +This decoder maintains internal indexes from historical n-grams to either previous positions or cached continuation tokens. When called with the current verified token sequence, it searches for the final n-gram in the already verified history and returns a continuation from the most recent valid historical match. -This decoder maintains an internal inverted index from historical n-grams to their positions. When called with the current token sequence, it looks up the final n-gram in the history and returns the following tokens from the most recent matching context. +It does not own or run a separate draft model. Rejected draft tokens do not require manual rollback inside this class, because the next call receives the verified token history through `input_ids`. ### Constructor @@ -86,52 +95,207 @@ def __init__( self, ngram_size: int = 3, num_pred_tokens: int = 10, -) + mode: Literal["k", "k4v"] = "k", + min_hits: int = 2, + max_entries_per_key: Optional[int] = None, + sync_check_tokens: int = 16, +) -> None ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `ngram_size` | `int` | `3` | Length of the token sequence used as the lookup key. Larger values require stricter context matches but may produce fewer hits. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return after a matching n-gram is found. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `ngram_size` | `int` | `3` | `__init__` signature | Number of tokens used as the lookup key. Larger values require stricter matches and may reduce hit rate. 
| +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. | +| `mode` | `Literal["k", "k4v"]` | `"k"` | `__init__` signature | Lookup storage mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `int` | `2` | `__init__` signature | Minimum number of historical matches required before returning a draft. Use `1` for maximum recall; use values greater than `1` to reduce low-confidence drafts. | +| `max_entries_per_key` | `Optional[int]` | `None` | `__init__` signature and initialization logic | Optional memory cap per n-gram key. If `mode="k4v"` and this is `None`, it is automatically set to `8`. | +| `sync_check_tokens` | `int` | `16` | `__init__` signature | Number of trailing tokens used to detect whether new input is an incremental append without doing a full prefix comparison. | + +### Parameter Validation + +The constructor raises `ValueError` when: + +| Condition | Error Meaning | +|---|---| +| `ngram_size <= 0` | `ngram_size` must be positive. | +| `num_pred_tokens <= 0` | `num_pred_tokens` must be positive. | +| `min_hits <= 0` | `min_hits` must be positive. | +| `max_entries_per_key is not None and max_entries_per_key <= 0` | The memory cap must be `None` or positive. | +| `sync_check_tokens <= 0` | `sync_check_tokens` must be positive. | +| `mode` is not `"k"` or `"k4v"` after lowercasing | Only the two supported lookup modes are valid. | + +### Lookup Modes + +| Mode | Internal Storage | Memory Use | Behavior | +|---|---|---|---| +| `"k"` | `key -> [position, position, ...]` | Lower | Stores historical positions and slices continuations from `_history` during lookup. | +| `"k4v"` | `key -> {position: continuation}` | Higher | Stores continuation tokens directly and returns the latest cached continuation. | + +Use `"k"` as the general-purpose default. Use `"k4v"` when faster continuation retrieval is preferred and the extra memory use is acceptable. 
For `"k4v"`, `max_entries_per_key` defaults to `8` when not specified. ### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ---------------------------------- | -------------- | -------------------------------------------------------------------------------- | -| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | -| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | -| `_ngram_map` | `Dict[Tuple[int, ...], List[int]]` | internal cache | Internal inverted index mapping n-gram tuples to positions in the token history. | -| `_history` | `List[int]` | internal cache | Internal token history used to maintain the n-gram map. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | +| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | +| `mode` | `str` | constructor | Active lookup mode: `"k"` or `"k4v"`. | +| `min_hits` | `int` | constructor | Required number of historical matches before returning a draft. | +| `max_entries_per_key` | `Optional[int]` | constructor / initialization logic | Optional per-key memory cap. Automatically becomes `8` for `k4v` mode when not provided. | +| `sync_check_tokens` | `int` | constructor | Trailing-token window used for incremental append detection. | +| `_history` | `List[int]` | internal state | Verified token history mirrored from `input_ids`. | +| `_map_k` | `DefaultDict[Tuple[int, ...], List[int]]` | internal state | Key-to-position index used in `"k"` mode. | +| `_map_k4v` | `DefaultDict[Tuple[int, ...], Dict[int, Tuple[int, ...]]]` | internal state | Key-to-continuation index used in `"k4v"` mode. | +| `_closed` | `bool` | internal state | Marks the decoder as closed. Calling the decoder after `close()` raises `RuntimeError`. 
| +| `_last_draft_len` | `int` | internal state | Length of the most recent returned draft. Currently internal diagnostic state. | + +Internal state should not be mutated directly. + +### Core Methods + +#### `__call__` + +```python +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] +``` + +Generates draft tokens from verified token history. + +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Accepted for interface compatibility and ignored by this implementation. | + +Returns: + +| Type | Description | +|---|---| +| `npt.NDArray[np.intc]` | Predicted draft tokens. Returns an empty array when no reliable match is found. | + +Raises: + +| Exception | Condition | +|---|---| +| `RuntimeError` | The decoder has been closed with `close()` and is called again. | + +#### `clear` + +```python +def clear(self) -> None +``` + +Clears token history and internal indexes while keeping the decoder reusable. + +Use this when starting a completely unrelated generation with the same decoder instance. + +#### `close` + +```python +def close(self) -> None +``` + +Clears internal containers and marks the decoder as closed. + +This class does not own native memory, but explicit cleanup can be useful in long-running applications that may otherwise keep large Python containers alive. + +#### `accept` + +```python +def accept(self, n_accepted: int) -> None +``` + +Compatibility hook for speculative decoding loops. -`_ngram_map` and `_history` are internal state and should not be modified directly. +This implementation is intentionally a no-op. Accepted tokens are reflected by the next `input_ids` passed to `__call__`, so no separate rollback or acceptance state update is required. ### Behavior When called, `LlamaNGramMapDecoding`: -1. Synchronizes its internal history with the provided `input_ids`. -2. 
Incrementally updates the n-gram map when tokens are appended. -3. Rebuilds the map if the input sequence is no longer a simple continuation, such as after rollback or a new prompt. -4. Uses the last `ngram_size` tokens as the search key. -5. Returns up to `num_pred_tokens` tokens following the most recent historical match. -6. Returns an empty NumPy array if no match is found. +1. Converts `input_ids` to a flat `np.intc` token list. +2. Synchronizes internal history with the verified token sequence. +3. Uses a fast path when the new input is identical to the stored history. +4. Uses an incremental append path when the trailing tokens indicate that the new input extends the previous input. +5. Rebuilds the index after rollback, prompt switch, truncation, or unsafe mutation. +6. Indexes only n-grams with at least one available continuation token, so the current tail n-gram does not match itself. +7. Looks up the final `ngram_size` tokens as the search key. +8. Requires at least `min_hits` historical matches before returning a draft. +9. Returns up to `num_pred_tokens` tokens from the latest valid historical match. +10. Returns an empty NumPy array if no reliable match is available. -### Example +### Example: Direct Prompt Lookup + +Use `min_hits=1` in a small standalone example so that one historical match is enough to return a draft. 
```python import numpy as np + from llama_cpp.llama_speculative import LlamaNGramMapDecoding draft_model = LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=5, + num_pred_tokens=2, + min_hits=1, ) -input_ids = np.array([1, 2, 3, 4, 1, 2, 3], dtype=np.intc) - +input_ids = np.array([1, 2, 3, 4, 5, 1, 2, 3], dtype=np.intc) draft_tokens = draft_model(input_ids) print(draft_tokens) +# Expected output: +# [4 5] +``` + +### Example: Use with `Llama` + +```python +from llama_cpp import Llama +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +llm = Llama( + model_path="path/to/model.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + ), +) + +response = llm.create_chat_completion( + messages=[ + { + "role": "user", + "content": ( + "Write five short Python classes with the same CRUD method layout: " + "User, Product, Order, Review, and Category." + ), + } + ] +) + +print(response["choices"][0]["message"]["content"]) +``` + +### Example: Use `k4v` Mode with a Memory Cap + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=4, + num_pred_tokens=8, + mode="k4v", + min_hits=2, + max_entries_per_key=8, +) ``` ## `LlamaPromptLookupDecoding` @@ -144,7 +308,7 @@ Legacy speculative decoder based on NumPy sliding-window lookup. This implementation is stateless. Each call scans the input token sequence to find previous occurrences of the current n-gram and returns the following tokens as draft predictions. -> Warning: This implementation may have high computational overhead for long contexts. Prefer `LlamaNGramMapDecoding` for new usage. +> Warning: This implementation is not recommended for production. It may have high computational overhead for long contexts and may degrade output quality. Prefer `LlamaNGramMapDecoding` for new usage. 
### Constructor @@ -156,16 +320,16 @@ def __init__( ) ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | -------------------------------------------------------------------------- | -| `max_ngram_size` | `int` | `3` | Maximum n-gram size to search for. The decoder tries larger n-grams first. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `max_ngram_size` | `int` | `3` | `__init__` signature | Maximum n-gram size to search for. The decoder tries larger n-grams first. | +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. | ### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ----- | ----------- | --------------------------------------------------- | -| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | | `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | ### Static Method @@ -181,58 +345,71 @@ def find_candidate_pred_tokens( Linearly scans `input_ids` using NumPy sliding windows to find matching n-grams. -| Parameter | Type | Description | -| ----------------- | ---------------------- | ----------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | -| `max_ngram_size` | `int` | Maximum n-gram size to search for. | -| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | +| `max_ngram_size` | `int` | Maximum n-gram size to search for. 
| +| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | Returns: -| Type | Description | -| ---------------------- | --------------------------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Candidate draft tokens, or an empty array if no match is found. | -### Example +### Method ```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaNGramMapDecoding - -llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", - n_ctx=4096, - n_gpu_layers=-1, - draft_model=LlamaNGramMapDecoding( - ngram_size=3, - num_pred_tokens=10 - ) -) - -response = llama.create_chat_completion( - messages=[{"role": "user", "content": """ - Write a Python script using `sqlite3` to define CRUD (Create, Read, Update, Delete) operations for an e-commerce database. -You need to create 5 separate classes for the following entities: `User`, `Product`, `Order`, `Review`, and `Category`. -Each class MUST have exactly the same internal structure and method names (create, get, update, delete). Do not add extra logic, just the standard boilerplate. - """}] -) +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] ``` +Calls `find_candidate_pred_tokens` with the instance's `max_ngram_size` and `num_pred_tokens`. + ## Best Practices & Common Patterns -* Prefer `LlamaNGramMapDecoding` for new usage. -* Use `LlamaPromptLookupDecoding` only when compatibility with the older stateless prompt lookup behavior is needed. -* Increase `ngram_size` or `max_ngram_size` for stricter context matching. -* Increase `num_pred_tokens` when you want longer draft proposals, but keep in mind that speculative decoding still depends on later verification by the main model. -* Do not mutate `_ngram_map` or `_history` directly. -* If input token history rolls back or changes unexpectedly, `LlamaNGramMapDecoding` automatically rebuilds its internal cache. 
+- Prefer `LlamaNGramMapDecoding` for new usage. +- Use `mode="k"` as the default memory-efficient mode. +- Use `mode="k4v"` when cached continuations are useful and the additional memory use is acceptable. +- Keep `max_entries_per_key` set for `k4v` mode unless you intentionally want an unbounded per-key cache. +- Use `min_hits=1` for maximum recall in repetitive prompts or benchmarks. +- Use `min_hits > 1` to reduce low-confidence drafts. +- Increase `ngram_size` for stricter pattern matching. +- Increase `num_pred_tokens` to allow longer draft proposals, but remember that the target model still verifies the tokens. +- Call `clear()` before reusing the same decoder for an unrelated prompt or generation session. +- Do not call a decoder after `close()`; it raises `RuntimeError`. Create a new instance instead. +- Do not mutate `_history`, `_map_k`, `_map_k4v`, or other internal state directly. + +## Limitations + +- Prompt lookup only predicts tokens that are already implied by repeated patterns in the verified context. +- It is most useful for repetitive, structured, or boilerplate-heavy output. +- It may return an empty draft when the context has too few repeated n-grams or when `min_hits` is too strict. +- It does not replace target-model verification. +- `LlamaPromptLookupDecoding` is kept for compatibility and is not recommended for production use. ## Deprecated / Changed APIs -`LlamaPromptLookupDecoding` is marked as a legacy NumPy sliding-window implementation in the source code. It is still available, but `LlamaNGramMapDecoding` is the preferred implementation for faster repeated calls over long contexts. +`LlamaPromptLookupDecoding` is the legacy NumPy sliding-window implementation. It remains available, but `LlamaNGramMapDecoding` is the preferred prompt lookup implementation for new code. 
+ +Compared with the older `LlamaNGramMapDecoding` documentation, the current implementation adds: + +- `mode` +- `min_hits` +- `max_entries_per_key` +- `sync_check_tokens` +- `clear()` +- `close()` +- `accept()` +- Separate internal indexes for `k` and `k4v` modes ## Related Links * [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [[Benchmark_Speculative](https://github.com/JamePeng/llama-cpp-python/blob/main/examples/benchmark/benchmark_speculative.py)] diff --git a/docs/wiki/modules/Logger.md b/docs/wiki/modules/Logger.md new file mode 100644 index 0000000000..f24f7f43a8 --- /dev/null +++ b/docs/wiki/modules/Logger.md @@ -0,0 +1,216 @@ +--- +title: Logger +class_name: Logger (module) +module_name: llama_cpp._logger +source_file: llama_cpp/_logger.py +last_updated: 2026-05-16 +version_target: latest +--- + +## Overview + +The `Logger` module provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure. It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. + +## Role in the Library + +- **Wraps low-level logging**: It intercepts and transforms log events from the C/C++ backend (`ggml_log_callback`). +- **Connects to Python logging**: Maps `ggml` verbosity levels (0–5) to `logging` levels (ERROR, WARNING, INFO, DEBUG), and routes output to `stdout`/`stderr` based on severity. +- **Provides filtering**: Substring-based message filtering to suppress specific log categories (e.g., CUDA Graph output). +- **Extends the API surface**: Offers both explicit configuration functions and convenient shorthand setters (`set_verbose`, `set_quiet`), while preserving full control through `configure_logging`. 
+ +## Core Methods + +### `configure_logging(*, verbosity=None, verbose=None, quiet=None, silent=None, show_output=None, log_filters=None, append_log_filters=None, log_filters_case_sensitive=None)` + +The primary configuration function. Combines multiple parameters into a unified verbosity level. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `verbosity` | int \| bool \| None | None | Numeric level (0–5). `False` maps to `ERROR` (1), `True` to `DEBUG` (5). | +| `verbose` | bool | None | Shorthand: `True` → `DEBUG`, `False` → `ERROR`. | +| `quiet` | bool | None | Shorthand: `True` → `WARN` (2). | +| `silent` | bool | None | Shorthand: `True` → `ERROR` (1). | +| `show_output` | bool | None | Whether `GGML_LOG_LEVEL_NONE` (output) should be shown. | +| `log_filters` | Iterable[str] | None | List of substring patterns to filter out. | +| `append_log_filters` | Iterable[str] | None | Append additional filter patterns. | +| `log_filters_case_sensitive` | bool | None | Whether filters are case-sensitive. | + +### `set_verbose(verbose: bool)` + +Shorthand setter. `verbose=True` sets `verbosity=DEBUG`, `verbose=False` sets `verbosity=ERROR`. + +### `set_verbosity(verbosity: VerbosityLike)` + +Sets verbosity to any value accepted by `configure_logging`. + +### `get_verbosity() -> int` + +Returns current configured verbosity level (0–5). + +### `set_quiet(quiet: bool = True)` + +Sets `verbosity=WARN` (`2`). + +### `set_silent(silent: bool = True)` + +Sets `verbosity=ERROR` (`1`). + +### `set_log_filters(filters: Iterable[str], *, case_sensitive: bool = True)` + +Replaces all substring log filters. + +### `get_log_filters() -> list[str]` + +Returns current filter list. + +### `add_log_filters(filters: Iterable[str])` + +Appends filters to the current list. + +### `clear_log_filters()` + +Removes all user-defined filters. + +### `reset_log_filters()` + +Restores the default filter list: `["CUDA Graph", "CUDA graph"]`. 
+ +### `reset_logging()` + +Resets to default: `verbosity=INFO` (`3`), `show_output=True`, default filters. + +## Important Attributes / State + +| Attribute | Type | Source | Description | +|-----------|------|--------|-------------| +| `_config` | LoggerConfig | Internal | Holds the current configuration: verbosity, output streams, filters. | +| `_last_verbosity` | int | Internal | Tracks the last verbosity level set by `ggml_log_callback`. | + +## Best Practices & Common Patterns + +### 1. Default Behavior +Use `reset_logging()` to start with `INFO` verbosity, which shows warnings and errors but hides internal debug output. + +```python +from llama_cpp import Llama +from llama_cpp import reset_logging + +reset_logging() # Default verbosity=3 (INFO), show warnings and errors +llm = Llama(model_path="models/qwen3.gguf") +llm("Explain quantum physics.") +``` + +### 2. Precise Logging via `verbosity` +Replace the legacy `verbose` boolean with the precise `verbosity` parameter. `verbose=False` maps to `ERROR` (1), `verbose=True` to `DEBUG` (5). + +```python +from llama_cpp import Llama + +# Legacy (coarse control): +llm_quiet = Llama(model_path="models/qwen3.gguf", verbose=False) +llm_quiet("What is a neural network?") + +# Modern (fine-grained control): +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) +llm("What is a neural network?") +``` + +### 3. Low-Level Debugging +For deep backend debugging, set `verbosity=5` (DEBUG) and optionally disable substring filters to see all diagnostic output. + +```python +from llama_cpp import Llama + +# Debug-level logs, showing all backend diagnostics +llm = Llama(model_path="models/qwen3.gguf", verbosity=5) + +# If you want to see normally filtered CUDA Graph messages: +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # Disable all substring filters +) +``` + +### 4. Substring-Based Backend Noise Filtering +Suppress known noisy backend messages by passing substring filters. 
This prevents "CUDA Graph" and model loading chatter from flooding the console. + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # INFO level + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + "llama_perf_context_print", + ], +) +llm("What is a transformer?") +``` + +### 5. Runtime Logging Adjustments +Since logging is process-global, you can adjust verbosity or filters at runtime — changes apply to all `Llama` instances in the same process. + +```python +from llama_cpp import Llama + +llm = Llama(model_path="models/qwen3.gguf", verbosity=2) # QUIET: only show warnings and errors +llm("Quick answer: What is machine learning?") + +# Temporarily increase verbosity for diagnostics +llm.set_verbosity(5) +llm("Show me the full debug log for this prompt") +llm.set_verbosity(2) # Return to QUIET + +# Add a specific filter without resetting everything +llm.add_log_filters(["llama_perf_context_print"]) +llm("Final answer: What is machine learning?") +``` + +### 6. Complete Diagnostic Session +For a full diagnostic session, combine precise verbosity, custom filters, and runtime control: + +```python +from llama_cpp import Llama + +# 1. Start with info-level verbosity +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) + +# 2. Suppress backend noise +llm.set_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", +]) + +# 3. Run inference +llm("Explain the llama.cpp inference pipeline") + +# 4. Temporarily increase verbosity for a specific call +llm.set_verbosity(5) +llm("Show debug output for cache hit details") +llm.set_verbosity(3) # Return to the initial INFO level + +# 5. Remove filters after session +llm.clear_log_filters() +``` + +## Key Considerations + +- **Process-global**: Logging configuration affects all `Llama` instances in the same process.
@dataclass(frozen=True)
class Scenario:
    """One benchmark prompt plus the metadata reported alongside its results.

    Instances are immutable (``frozen=True``) and are collected in
    ``TEST_SCENARIOS``.
    """

    # Human-readable label; printed in progress output and used as the
    # grouping key in the summary and the CSV.
    name: str
    # Short machine-friendly tag for the kind of workload; copied into each
    # result row.
    category: str
    # Full prompt text passed verbatim to ``create_completion``.
    prompt: str
    # Free-text note describing the speedup the author expects; informational
    # only, it is printed and stored but never checked.
    expected_behavior: str
+TEST_SCENARIOS: List[Scenario] = [ + Scenario( + name="A1. Medium-High Repetition - CRUD Boilerplate Code", + category="code_boilerplate", + expected_behavior="Should benefit from n-gram lookup because class and method structures repeat.", + prompt="""<|im_start|>system +You are a senior backend developer. Write highly structured and consistent boilerplate code.<|im_end|> +<|im_start|>user +Write a Python script using `sqlite3` to define CRUD operations for a core banking system database. + +Create 6 separate classes: +- Account +- Transaction +- Customer +- Loan +- Portfolio +- AuditLog + +Each class MUST use the same internal method structure: +- create +- get +- update +- delete +- list_all + +Do not add extra explanations. Output only code.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A2. Extreme Repetition - JSONL Trading Logs", + category="structured_logs", + expected_behavior="Should strongly favor n-gram methods, especially K/K4V.", + prompt="""<|im_start|>system +You are a deterministic data generation script. Output only raw JSON lines.<|im_end|> +<|im_start|>user +Continue this algorithmic trading execution log for 40 more lines. +Only change timestamp seconds, symbol, quantity, price, and execution_time_ms. + +{"timestamp":"2026-05-23T09:30:01Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"AAPL","side":"BUY","quantity":100,"price":175.50,"execution_time_ms":12} +{"timestamp":"2026-05-23T09:30:02Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"MSFT","side":"SELL","quantity":50,"price":410.25,"execution_time_ms":15} +{"timestamp":"2026-05-23T09:30:03Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"TSLA","side":"BUY","quantity":200,"price":180.10,"execution_time_ms":11}<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A3. 
Markdown Table - Repetitive Course Catalog", + category="markdown_table", + expected_behavior="Repeated table columns and row structure should benefit from speculative lookup.", + prompt="""<|im_start|>system +You generate clean Markdown tables with consistent formatting.<|im_end|> +<|im_start|>user +Create a Markdown comparison table for 30 university postgraduate courses. + +Columns: +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | + +The row format must stay consistent. +Use concise but realistic academic descriptions. +Do not add explanation outside the table.<|im_end|> +<|im_start|>assistant +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | +|---:|---|---|---:|---|---|---| +""", + ), + Scenario( + name="A4. Structured Financial Market Report", + category="structured_report", + expected_behavior="Heading and bullet patterns repeat; n-gram lookup should help moderately.", + prompt="""<|im_start|>system +You are a quantitative macroeconomic analyst. Output structured, clear, and professional financial reports.<|im_end|> +<|im_start|>user +Write a Q3 Macroeconomic & Equity Strategy Outlook Report for institutional investors. + +Requirements: +1. Divide the report into exactly 8 sections. +2. Each section MUST contain exactly one heading and 3 bullet points. +3. Repeatedly emphasize the following themes across the sections: interest rate trajectory, inflation stickiness, equity market volatility, supply chain realignment, and fixed-income duration strategies. +4. Keep the tone highly professional and analytical.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="B1. Low Repetition - Macroeconomic Historical Essay", + category="low_repetition_creative", + expected_behavior="Should show limited or no speedup; useful as a negative control.", + prompt="""<|im_start|>system +You are an academic historian of economics. 
Write with varied sentence structures, rich vocabulary, and analytical depth.<|im_end|> +<|im_start|>user +Write a comprehensive essay exploring the psychological and sociological impacts of hyperinflation on institutional trust during the Weimar Republic in the 1920s. + +Requirements: +- Use highly academic and varied language. +- Do NOT use repetitive paragraph structures. +- Do NOT use bullet points or lists. +- Avoid parallel phrasing; favor complex, flowing narrative analysis. +- Make it a long, continuous essay.<|im_end|> +<|im_start|>assistant +The catastrophic devaluation of the Papiermark in the early 1920s fundamentally fractured the psychological bedrock of the Weimar Republic. """, + ), + Scenario( + name="B2. Reasoning-Like Explanation - Quantitative Finance", + category="reasoning_explanation", + expected_behavior="May show smaller speedup because content is less template-like.", + prompt="""<|im_start|>system +You are a careful technical explainer. Avoid repetitive phrasing.<|im_end|> +<|im_start|>user +Explain the foundational assumptions and inherent limitations of the Black-Scholes option pricing model. + +Discuss the following concepts contextually: +- Log-normal distribution of asset prices +- The assumption of constant volatility and risk-free rates +- Frictionless markets (no transaction costs or taxes) +- The difference in applicability between European and American options + +Write in clear, academic paragraphs. Do not use bullet points or lists.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="C1. Long Context Copy-Edit - High Local Reuse", + category="copy_edit", + expected_behavior="Prompt contains repeated phrases; n-gram lookup should exploit local reuse.", + prompt="""<|im_start|>system +You are a precise academic editing assistant. Preserve the structure while improving the wording.<|im_end|> +<|im_start|>user +Rewrite the following academic grant proposal abstract in a cleaner professional style. 
@dataclass(frozen=True)
class EngineConfig:
    """Describes one decoding setup to benchmark.

    Instances are immutable (``frozen=True``) and are collected in
    ``ENGINE_CONFIGS``.
    """

    # Display name; also the grouping key in the summary, where the entry
    # named "Baseline" is the reference for speedup.
    name: str
    # Zero-argument callable that builds the draft model for a single run.
    # Returning None means no speculative decoding.
    draft_factory: Callable[[], Optional[object]]
    # Short description printed with each run and stored in the CSV.
    note: str
def cleanup_model(llm: Optional["Llama"]) -> None:
    """Release a finished model and run a garbage-collection pass.

    Args:
        llm: The model to dispose of, or None when model creation failed
            before an instance existed.

    The model is closed explicitly when it exposes a callable ``close``.
    Merely deleting the parameter would only unbind this function's local
    name: the caller still holds its own reference, so the object would stay
    alive during the ``gc.collect()`` below and nothing would be freed until
    the caller returned.
    """
    if llm is not None:
        closer = getattr(llm, "close", None)
        if callable(closer):
            closer()

    # Sweep any reference cycles left behind before the next model is loaded.
    gc.collect()
def summarize_results(rows: List[Dict[str, object]]) -> None:
    """Print one throughput table per scenario.

    Args:
        rows: Result records; each must provide the keys ``"scenario"``,
            ``"engine"`` and ``"tokens_per_sec"``.

    For every scenario the average, best and worst tokens/sec of each engine
    are printed, together with the speedup relative to the engine named
    ``"Baseline"``. When no positive baseline average exists the speedup is
    reported as 1.00x.
    """
    banner = "=" * 90
    divider = "-" * 90

    print("\n\n" + banner)
    print("📊 Benchmark Summary")
    print(banner)

    # Bucket the records by scenario, keeping first-seen order.
    per_scenario: Dict[str, List[Dict[str, object]]] = {}
    for record in rows:
        key = str(record["scenario"])
        if key not in per_scenario:
            per_scenario[key] = []
        per_scenario[key].append(record)

    for title in per_scenario:
        print(f"\n📂 {title}")
        print(divider)

        # Engine name -> tokens/sec samples, again in first-seen order.
        samples_by_engine: Dict[str, List[float]] = {}
        for record in per_scenario[title]:
            bucket = samples_by_engine.setdefault(str(record["engine"]), [])
            bucket.append(float(record["tokens_per_sec"]))

        reference = samples_by_engine.get("Baseline", [0.0])
        baseline_avg = statistics.mean(reference)

        header = (
            f"{'Engine':<32} | {'Avg tok/s':>10} | {'Best':>10} | "
            f"{'Worst':>10} | {'Speedup':>8}"
        )
        print(header)
        print(divider)

        for engine, samples in samples_by_engine.items():
            mean_speed = statistics.mean(samples)
            if baseline_avg > 0:
                ratio = mean_speed / baseline_avg
            else:
                ratio = 1.0
            line = (
                f"{engine:<32} | {mean_speed:10.2f} | {max(samples):10.2f} | "
                f"{min(samples):10.2f} | {ratio:8.2f}x"
            )
            print(line)
+ + summarize_results(rows) + save_csv(rows, CSV_OUTPUT) + + +if __name__ == "__main__": + run_benchmark() \ No newline at end of file diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 438bf08b58..1650e6af69 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.38" +__version__ = "0.3.40" diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index a8936fa2bf..1a9f8eb8c5 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -18,6 +18,37 @@ ) from typing_extensions import TypeAlias +def _format_library_dir_contents(base_paths: list[pathlib.Path]) -> str: + """Format directory contents for diagnostics after library loading fails.""" + sections = [] + + for base_path in base_paths: + p = pathlib.Path(base_path) + + if not p.exists(): + sections.append(f"{p}: ") + continue + + if not p.is_dir(): + sections.append(f"{p}: ") + continue + + try: + # Only list files when reporting a final loading failure. + files = sorted(x.name for x in p.iterdir()) + except Exception as e: + sections.append(f"{p}: ") + continue + + if files: + sections.append( + f"{p}:\n" + + "\n".join(f" - {name}" for name in files) + ) + else: + sections.append(f"{p}: ") + + return "\n".join(sections) # Load the library def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list[pathlib.Path]]): @@ -114,9 +145,12 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list except Exception as e: errors.append(f"{lib_path}: {e}") + # Include directory contents only in the failure path to avoid extra work during successful imports. 
raise RuntimeError( f"Failed to load '{lib_base_name}' from {base_paths}\n" + "\n".join(errors) + + "\nLibrary search path contents:\n" + + _format_library_dir_contents(base_paths) ) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 8f4cb1187f..c4ae7c94bf 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -1295,8 +1295,8 @@ def ggml_backend_load(path: ctypes.c_char_p) -> ggml_backend_reg_t: # // Unload a backend if loaded dynamically and unregister it # GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); -@ggml_function("ggml_backend_load_all", [ctypes.c_void_p], None) -def ggml_backend_load_all(reg: ggml_backend_reg_t): +@ggml_function("ggml_backend_unload", [ctypes.c_void_p], None) +def ggml_backend_unload(reg: ggml_backend_reg_t): """ Unload a backend if loaded dynamically and unregister it """ diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b4ba1f4b21..91befb2247 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -3,6 +3,7 @@ import ctypes import enum import os +import sys from typing import ( Callable, @@ -102,7 +103,7 @@ def vocab_type(self) -> int: return llama_cpp.llama_vocab_type(self.model) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.vocab) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: return llama_cpp.llama_model_n_ctx_train(self.model) @@ -131,41 +132,81 @@ def n_head_kv(self) -> int: def n_swa(self) -> int: return llama_cpp.llama_model_n_swa(self.model) + def rope_freq_scale_train(self) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + return llama_cpp.llama_model_rope_freq_scale_train(self.model) + + def model_desc(self) -> str: + """ + Get a string describing the model type + """ + buf = ctypes.create_string_buffer(256) + llama_cpp.llama_model_desc(self.model, buf, 256) + return buf.value.decode("utf-8") + + def model_size(self) -> int: + """ + Returns the total size of all the tensors in the model in bytes + 
""" + return llama_cpp.llama_model_size(self.model) + + def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]: + """ + Get a chat template from the model. + + If name is None, returns the default chat template. + Returns None if no chat template is available. + """ + template = llama_cpp.llama_model_chat_template(self.model, name) + if template is None: + return None + return template.decode("utf-8") + def n_params(self) -> int: + """ + Returns the total number of parameters in the model + """ return llama_cpp.llama_model_n_params(self.model) def has_encoder(self) -> bool: + """ + Returns true if the model contains an encoder that requires llama_encode() call + """ return llama_cpp.llama_model_has_encoder(self.model) def has_decoder(self) -> bool: + """ + Returns true if the model contains a decoder that requires llama_decode() call + """ return llama_cpp.llama_model_has_decoder(self.model) def decoder_start_token(self) -> int: + """ + For encoder-decoder models, this function returns id of the token that must be provided + to the decoder to start generating output sequence. For other models, it returns -1. + """ return llama_cpp.llama_model_decoder_start_token(self.model) def is_recurrent(self) -> bool: + """ + Returns true if the model is recurrent (like Mamba, RWKV, etc.) + """ return llama_cpp.llama_model_is_recurrent(self.model) def is_hybrid(self) -> bool: + """ + Returns true if the model is hybrid (like Jamba, Granite, etc.) + """ return llama_cpp.llama_model_is_hybrid(self.model) def is_diffusion(self) -> bool: + """ + Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) 
+ """ return llama_cpp.llama_model_is_diffusion(self.model) - def rope_freq_scale_train(self) -> float: - return llama_cpp.llama_model_rope_freq_scale_train(self.model) - - def desc(self) -> str: - buf = ctypes.create_string_buffer(1024) - llama_cpp.llama_model_desc(self.model, buf, 1024) - return buf.value.decode("utf-8") - - def size(self) -> int: - return llama_cpp.llama_model_size(self.model) - - def get_tensor(self, name: str) -> ctypes.c_void_p: - raise NotImplementedError("get_tensor is not implemented in llama.cpp") - # Vocab def token_get_text(self, token: int) -> str: @@ -493,9 +534,13 @@ def __init__( ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) - if ctx is None: - llama_cpp.llama_model_free(self.model.model) - raise ValueError("Failed to create context with model") + if not ctx: + raise RuntimeError( + "Failed to create llama context with model. " + "This may indicate that llama_context_params is out of sync with " + "the bundled llama.cpp version, or that required context parameters " + "were not initialized correctly." + ) self.ctx = ctx @@ -519,6 +564,13 @@ def close(self): def __del__(self): self.close() + def _assert_ctx(self): + if not getattr(self, "ctx", None): + raise RuntimeError( + "LlamaContext is not initialized or has already been closed. " + "Context-dependent llama.cpp operations cannot continue." 
+ ) + def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) @@ -534,6 +586,9 @@ def n_ubatch(self) -> int: def n_seq_max(self) -> int: return llama_cpp.llama_n_seq_max(self.ctx) + def n_rs_seq(self) -> int: + return llama_cpp.llama_n_rs_seq(self.ctx) + def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) @@ -652,6 +707,7 @@ def set_state_seq_data_ext( # // Decoding API def encode(self, batch: LlamaBatch): + self._assert_ctx() return_code = llama_cpp.llama_encode( self.ctx, batch.batch, @@ -676,6 +732,7 @@ def decode(self, batch: 'LlamaBatch') -> int: RuntimeError: If a fatal, non-recoverable error occurs during decoding (e.g., negative error codes or invalid batch structures). """ + self._assert_ctx() return_code = llama_cpp.llama_decode(self.ctx, batch.batch) if return_code == 0: @@ -723,13 +780,6 @@ def set_causal_attn(self, causal_attn: bool): """ llama_cpp.llama_set_causal_attn(self.ctx, causal_attn) - def set_warmup(self, warmup: bool): - """ - Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights. - """ - llama_cpp.llama_set_warmup(self.ctx, warmup) - def synchronize(self): """ Wait until all computations are finished @@ -739,21 +789,51 @@ def synchronize(self): llama_cpp.llama_synchronize(self.ctx) def get_logits(self): - return llama_cpp.llama_get_logits(self.ctx) + """ + Token logits obtained from the last call to llama_decode() + The logits for which llama_batch.logits[i] != 0 are stored contiguously + in the order they have appeared in the batch. 
+ Rows: number of tokens for which llama_batch.logits[i] != 0 + Cols: n_vocab + + Returns: + Pointer to the logits buffer of shape (n_tokens, n_vocab) + """ + self._assert_ctx() + logits = llama_cpp.llama_get_logits(self.ctx) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits: failed to get logits") + return logits def get_logits_ith(self, i: int): - return llama_cpp.llama_get_logits_ith(self.ctx, i) + """ + Return logits for the ith output row from the last llama_decode call. + + Note: + This calls llama_get_logits_ith(), which may reorder/synchronize + the output buffer internally. Avoid calling it on the hot path unless + Python-side logits are required. + """ + self._assert_ctx() + logits = llama_cpp.llama_get_logits_ith(self.ctx, i) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits_ith: invalid logits index {i}") + return logits def set_embeddings(self, embeddings: bool): + self._assert_ctx() llama_cpp.llama_set_embeddings(self.ctx, embeddings) def get_embeddings(self): + self._assert_ctx() return llama_cpp.llama_get_embeddings(self.ctx) def get_embeddings_ith(self, i: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_ith(self.ctx, i) def get_embeddings_seq(self, seq_id: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) def reset_timings(self): @@ -1184,6 +1264,60 @@ class CommonSamplerType(enum.IntEnum): CUSTOM = 99 + +# common/reasssoning-budget.h +# +# enum common_reasoning_budget_state { +# REASONING_BUDGET_IDLE, // waiting for start sequence +# REASONING_BUDGET_COUNTING, // counting down tokens +# REASONING_BUDGET_FORCING, // forcing budget message + end sequence +# REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion +# REASONING_BUDGET_DONE, // passthrough forever +# }; +class ReasoningBudgetState(enum.IntEnum): + """ + State machine for the generic first-reasoning-block budget controller. + + This sampler only controls the first reasoning block. 
class TokenMatcher:
    """
    Incremental matcher for a multi-token sequence.

    Tokens are fed one at a time through advance(), which returns True on the
    token that completes the configured sequence. Accepts None as tokens to
    represent "no matcher": such a matcher never matches anything.

    On a mismatch the matcher falls back to the longest prefix of the target
    sequence that is still a suffix of the tokens seen so far. Without this,
    sequences with a repeated prefix are missed (e.g. target [A, A, B] fed
    A, A, A, B).
    """

    def __init__(self, tokens: Optional[Sequence[int]]):
        # If None, matcher never matches anything
        self.tokens = list(tokens) if tokens is not None else []
        # Number of leading entries of self.tokens matched so far.
        self.pos = 0

    def advance(self, token: int) -> bool:
        """
        Consume one token.

        Returns:
            True exactly when this token completes the target sequence. The
            matcher then restarts from position 0, so matches never overlap.
            False otherwise, including when the matcher has no tokens.
        """
        if not self.tokens:
            return False

        if token == self.tokens[self.pos]:
            self.pos += 1
            if self.pos >= len(self.tokens):
                self.pos = 0
                return True
            return False

        # Mismatch: the stream so far ends with tokens[:pos] + [token].
        # Keep the longest proper suffix of that window that is also a prefix
        # of the target. Target sequences are only a few tokens long, so the
        # direct scan is cheaper and simpler than a precomputed failure table,
        # and it stays correct if self.tokens / self.pos are assigned directly.
        window = self.tokens[:self.pos] + [token]
        self.pos = 0
        for keep in range(len(window) - 1, 0, -1):
            if window[-keep:] == self.tokens[:keep]:
                self.pos = keep
                break
        return False

    def reset(self) -> None:
        """Forget any partial match and start again from the first token."""
        self.pos = 0
+ # Matches llama.cpp CLI semantics: + # --reasoning-budget N + reasoning_budget: int = -1 # -1 = unrestricted / disabled, 0 = immediate end, N > 0 = token budget + + # Token/text sequence that marks the beginning of the first reasoning block. + # This sequence is tokenized with add_bos=False, special=True before building + # the ReasoningBudgetSampler. + reasoning_start: str = "" + + # Token/text sequence that marks the natural end of the reasoning block. + # When the budget is exhausted, the sampler forces: + # reasoning_budget_message + reasoning_end + reasoning_end: str = "" + + # Optional message injected before reasoning_end when the budget is exhausted. + # Mirrors llama.cpp CLI semantics: + # --reasoning-budget-message MESSAGE + # + # Example forced text: + # "[reasoning budget exhausted]\n
" + reasoning_budget_message: Optional[str] = None + + # True when the prompt/chat template has already inserted reasoning_start. + # + # In that case, the sampler will not see the start tag during generation, so + # it must start directly in COUNTING state from the first generated token. + reasoning_start_in_prompt: bool = False + + # Safety window for non-reasoning models. + # + # If reasoning_start is not generated within this many output tokens, the + # sampler permanently switches to DONE and becomes a no-op. This prevents + # later literal mentions of "" in normal answer text from accidentally + # activating the budget controller. + # + # Ignored when reasoning_start_in_prompt=True because counting starts from + # the first generated token. + # + # Set to None to keep waiting for reasoning_start indefinitely. + reasoning_start_max_tokens: Optional[int] = 32 + custom_samplers: List['CustomSampler'] = field(default_factory=list) samplers: List[CommonSamplerType] = field( @@ -1267,11 +1454,18 @@ def print_params(self) -> str: f"\ttop_k = {self.top_k}, top_p = {self.top_p:.3f}, min_p = {self.min_p:.3f}, " f"xtc_probability = {self.xtc_probability:.3f}, xtc_threshold = {self.xtc_threshold:.3f}, " - f"typical_p = {self.typ_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" + f"typical_p = {self.typical_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" f"\tmirostat = {self.mirostat}, mirostat_lr = {self.mirostat_eta:.3f}, " f"mirostat_ent = {self.mirostat_tau:.3f}, adaptive_target = {self.adaptive_target:.3f}, " - f"adaptive_decay = {self.adaptive_decay:.3f}" + f"adaptive_decay = {self.adaptive_decay:.3f}\n" + + f"\treasoning_budget = {self.reasoning_budget}, " + f"reasoning_start = {self.reasoning_start!r}, reasoning_end = {self.reasoning_end!r}\n" + + f"\treasoning_budget_message = {self.reasoning_budget_message!r}, " + f"reasoning_start_in_prompt = {self.reasoning_start_in_prompt}, " + f"reasoning_start_max_tokens = 
{self.reasoning_start_max_tokens}" ) return result @@ -1339,7 +1533,7 @@ def __init__( _existing_sampler: Optional[LlamaSampler] = None, # Internal use for cloning ): if model is None: - raise RuntimeError("model must not be None") + raise RuntimeError("LlamaSamplingContext: model must not be None") self.model = model self.params = params @@ -1349,8 +1543,8 @@ def __init__( lparams = llama_cpp.llama_sampler_chain_default_params() lparams.no_perf = params.no_perf - # history (bounded) - # last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) + # History (bounded) + # Last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) if self.params.penalty_last_n == -1: # full context self.params.penalty_last_n = self.model.n_ctx_train() @@ -1363,10 +1557,10 @@ def __init__( ) self.prev = deque(maxlen=max(self.params.n_prev, 32)) - # reusable token data array + # Reusable token data array self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) - # reusable numpy logits view + # Reusable numpy logits view self._logits_view = None self._logits_ptr_addr = None @@ -1378,14 +1572,17 @@ def __init__( sorted=False, ) - # sampler chain + # Active Python reasoning-budget sampler for this sampling context. + self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None + + # Sampler chain if _existing_sampler: self.sampler_chain = _existing_sampler else: self.sampler_chain = LlamaSampler() self._build_sampler_chain() - # grammar sampler + # Grammar sampler self.grammar_sampler = None if params.grammar: self.grammar_sampler = GrammarSampler( @@ -1406,7 +1603,7 @@ def _build_sampler_chain(self): m = self.model if m is None: - raise RuntimeError("Model required to build sampler chain firstly") + raise RuntimeError("LlamaSamplingContext: Model required to build sampler chain firstly") use_adaptive_p = False @@ -1440,7 +1637,67 @@ def _build_sampler_chain(self): p.dry_sequence_breakers ) - # --- 5. 
Core Sampling Strategies (The "Filter" Loop) --- + # --- 5. Reasoning Budget --- + # + # Install before top-k/top-p/min-p filters so the forced end token cannot + # be removed from the candidate set before forcing happens. + # This sampler only controls the first reasoning block. Later blocks are ignored. + if p.reasoning_budget < -1: + raise ValueError( + "LlamaSamplingContext: reasoning_budget must be -1, 0, or a positive integer" + ) + + if p.reasoning_budget >= 0: + start_tokens = None + if not p.reasoning_start_in_prompt: + start_tokens = m.tokenize( + p.reasoning_start.encode("utf-8"), + add_bos=False, + special=True, + ) + if not start_tokens: + raise ValueError("LlamaSamplingContext: reasoning_start produced no tokens") + + end_tokens = m.tokenize( + p.reasoning_end.encode("utf-8"), + add_bos=False, + special=True, + ) + if not end_tokens: + raise ValueError("LlamaSamplingContext: reasoning_end produced no tokens") + + forced_text = (p.reasoning_budget_message or "") + p.reasoning_end + forced_tokens = m.tokenize( + forced_text.encode("utf-8"), + add_bos=False, + special=True, + ) + if not forced_tokens: + raise ValueError("LlamaSamplingContext: reasoning forced text produced no tokens") + + rb_sampler = ReasoningBudgetSampler( + model=m, + reasoning_budget=p.reasoning_budget, + start_tokens=start_tokens, + end_tokens=end_tokens, + forced_tokens=forced_tokens, + initial_state=( + ReasoningBudgetState.COUNTING + if p.reasoning_start_in_prompt + else ReasoningBudgetState.IDLE + ), + start_max_tokens=p.reasoning_start_max_tokens, + wait_utf8=True, + verbose=getattr(m, "verbose", False), + ) + + # Keep a direct Python reference so force_reasoning_budget() can + # manually transition COUNTING -> FORCING at runtime. + self.reasoning_budget_sampler = rb_sampler + + s.add_custom(rb_sampler) + + # --- 6. 
Core Sampling Strategies (The "Filter" Loop) --- # We iterate through the list to preserve user-defined order for these specific samplers for stype in p.samplers: if stype == CommonSamplerType.CUSTOM: @@ -1472,7 +1729,7 @@ def _build_sampler_chain(self): elif stype == CommonSamplerType.ADAPTIVE_P: use_adaptive_p = True - # --- 6. Final Distribution / Selection --- + # --- 7. Final Distribution / Selection --- # Mirostat overrides standard greedy/dist sampling if p.mirostat == 1 and m: s.add_mirostat(m.n_vocab(), p.seed, p.mirostat_tau, p.mirostat_eta, 100) @@ -1651,6 +1908,10 @@ def close(self): self.sampler_chain.close() self.sampler_chain = None + # Clear the convenience reference used for manual reasoning-budget force. + # The actual sampler lifetime is owned by sampler_chain.close(). + self.reasoning_budget_sampler = None + # Release large token data buffer used during sampling. # Important for high-vocab models to avoid memory retention. if hasattr(self, "_cur_p"): @@ -1697,24 +1958,53 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str: # Use the model linked to the context to detokenize return ctx_main.model.detokenize(last_n_tokens).decode("utf-8", errors="replace") + def force_reasoning_budget(self) -> bool: + """ + Manually force the active reasoning-budget sampler to end thinking. + + This mirrors llama.cpp's common_sampler_reasoning_budget_force() + behavior at the Python sampling-context level. + + Returns: + True if the sampler was actively COUNTING inside the first reasoning + block and was transitioned to FORCING. + + False if: + - no reasoning-budget sampler is installed + - the sampler is IDLE + - the sampler is WAITING_UTF8 + - the sampler is already FORCING + - the sampler is DONE + + Important: + Calling this while already FORCING must not rewind force_pos. The + underlying ReasoningBudgetSampler.force() handles this by allowing + only COUNTING -> FORCING. 
class ReasoningBudgetSampler(CustomSampler):
    """
    Generic first-reasoning-block budget sampler.

    This sampler is intentionally model-agnostic. It does not infer model
    families, inspect chat templates, or guess reasoning tags. The caller is
    responsible for passing the correct reasoning_start and reasoning_end token
    sequences.

    Behavior:
        1. Wait for the first reasoning_start token sequence, unless the prompt
           already inserted it and initial_state is COUNTING.
        2. Count accepted tokens inside the first reasoning block.
        3. If reasoning_end appears naturally, switch to DONE.
        4. If the budget is exhausted first, force:
               reasoning_budget_message + reasoning_end
           token by token.
        5. Once DONE, remain passthrough forever. Later reasoning tags are ignored.

    This mirrors the core idea of llama.cpp's reasoning-budget sampler while
    keeping the Python API small and explicit.
    """

    def __init__(
        self,
        *,
        model: LlamaModel,
        reasoning_budget: int,
        start_tokens: Optional[Sequence[int]],
        end_tokens: Sequence[int],
        forced_tokens: Sequence[int],
        initial_state: ReasoningBudgetState = ReasoningBudgetState.IDLE,
        start_max_tokens: Optional[int] = 32,
        wait_utf8: bool = True,
        verbose: bool = False,
    ):
        """
        Initialize the reasoning budget sampler.

        Args:
            model:
                The active LlamaModel wrapper. Used for token_to_piece() when
                checking UTF-8 boundaries.

            reasoning_budget:
                Token budget inside the first reasoning block.
                Must be >= 0 here. The disabled value -1 is handled before this
                sampler is created.

                0:
                    Force the end sequence immediately after reasoning starts.

                N > 0:
                    Allow at most N accepted tokens inside the reasoning block.

            start_tokens:
                Token sequence that starts reasoning budget counting.
                Must be provided when initial_state is IDLE.
                Can be None when initial_state is COUNTING, which is used when
                the prompt/chat template has already inserted reasoning_start.

            end_tokens:
                Token sequence that naturally ends the reasoning block.

            forced_tokens:
                Token sequence forced when the budget is exhausted. This should
                normally be tokenized from:
                    reasoning_budget_message + reasoning_end

            initial_state:
                Initial state of the sampler.
                IDLE:
                    Wait for start_tokens during generation.
                COUNTING:
                    Start counting from the first generated token. Use this when
                    reasoning_start is already present in the prompt.

            start_max_tokens:
                Safety window for non-reasoning models. If start_tokens are not
                observed within this many generated tokens, the sampler switches
                to DONE and becomes a no-op. Set to None to wait indefinitely.

            wait_utf8:
                If True, when the budget is exhausted on an incomplete UTF-8
                token piece, wait until a complete UTF-8 boundary before forcing
                the end sequence.

            verbose:
                If True, print high-level reasoning-budget state transitions to
                stderr. Logging is intentionally limited to transitions instead
                of per-token events to avoid noisy generation output.

        Raises:
            ValueError: If model is None, reasoning_budget is negative,
                end_tokens or forced_tokens is empty, or start_tokens is empty
                while initial_state is IDLE.
        """
        if model is None:
            raise ValueError("model must not be None")

        if reasoning_budget < 0:
            raise ValueError("reasoning_budget must be >= 0")

        self.model = model

        # Maximum number of tokens allowed inside the first reasoning block.
        # The disabled value (-1) should be handled before constructing this sampler.
        self.reasoning_budget = int(reasoning_budget)

        # Remaining tokens in the active reasoning block.
        self.remaining = int(reasoning_budget)

        # Incremental matcher for the first reasoning_start sequence.
        # Empty matcher is allowed only when initial_state=COUNTING.
        self.start_matcher = TokenMatcher(start_tokens)

        # Incremental matcher for the natural reasoning_end sequence.
        self.end_matcher = TokenMatcher(end_tokens)

        # Token sequence forced after budget exhaustion:
        #   reasoning_budget_message + reasoning_end
        self.forced_tokens = list(forced_tokens)

        if initial_state == ReasoningBudgetState.IDLE and not self.start_matcher.tokens:
            raise ValueError(
                "start_tokens must not be empty when initial_state=IDLE"
            )

        if not self.end_matcher.tokens:
            raise ValueError("end_tokens must not be empty")

        if not self.forced_tokens:
            raise ValueError("forced_tokens must not be empty")

        # State used by reset(). This is important for templates that already
        # insert reasoning_start into the prompt: reset must return to COUNTING,
        # not always IDLE.
        self.initial_state = ReasoningBudgetState(initial_state)

        # Current runtime state.
        self.state = ReasoningBudgetState(initial_state)

        # Index of the next token in forced_tokens to force.
        self.force_pos = 0

        # Count of generated tokens observed by this sampler.
        # Used only in IDLE to enforce start_max_tokens.
        self.generated_tokens = 0

        # Maximum number of generated tokens to wait for reasoning_start.
        # None means wait indefinitely.
        self.start_max_tokens = start_max_tokens

        # Whether to delay forcing until a complete UTF-8 boundary.
        self.wait_utf8 = wait_utf8

        # Whether to print high-level state transition logs.
        # This follows the model/runtime verbose flag and avoids per-token spam.
        self.verbose = verbose

        # Keep cloned Python sampler objects alive when llama.cpp clones the
        # sampler chain. Without this, cloned Python callbacks could be garbage
        # collected while C still holds function pointers to them.
        self._clone_keep_alive: List["ReasoningBudgetSampler"] = []

        # A zero budget with the start tag already in the prompt means there
        # is nothing to count: go straight to forcing the end sequence.
        if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0:
            self.state = ReasoningBudgetState.FORCING

        super().__init__(
            apply_func=self._apply,
            accept_func=self._accept,
            reset_func=self._reset,
            clone_func=self._clone,
            name="reasoning-budget",
        )

        self._log(
            f"initialized "
            f"(state={self.state.name}, budget={self.reasoning_budget}, "
            f"start_max_tokens={self.start_max_tokens}, wait_utf8={self.wait_utf8})."
        )

    def _log(self, message: str) -> None:
        """Print a verbose reasoning-budget state transition message."""
        if self.verbose:
            print(f"ReasoningBudgetSampler: {message}", file=sys.stderr)

    def force(self) -> bool:
        """
        Manually transition the active reasoning block into forced ending.

        This method is useful for external interruption scenarios, such as:
            - user clicks "stop thinking"
            - server-side thinking timeout
            - UI wants to skip the rest of the reasoning block while still allowing
              the model to continue with the final answer

        The transition is allowed only from COUNTING, which avoids rewinding
        force_pos when the sampler is already FORCING.

        Returns:
            True if the sampler was COUNTING and is now FORCING, else False.
        """
        if self.state != ReasoningBudgetState.COUNTING:
            return False

        self.state = ReasoningBudgetState.FORCING
        self.force_pos = 0
        self.end_matcher.reset()
        self._log("manual force requested; entering FORCING state.")
        return True

    def _token_utf8_complete(self, token: int) -> bool:
        """
        Return whether the token piece is a complete UTF-8 byte sequence.

        This is a safety feature. If the budget is exhausted in the middle of a
        multi-byte UTF-8 sequence, the sampler waits until a complete boundary
        before forcing reasoning_budget_message + reasoning_end.

        Always returns True when wait_utf8 is disabled or the piece is empty.
        """
        if not self.wait_utf8:
            return True

        try:
            piece = self.model.token_to_piece(token, special=False)
            if not piece:
                return True
            piece.decode("utf-8")
            return True
        except UnicodeDecodeError:
            return False
        except Exception:
            # Avoid getting stuck forever if token_to_piece behaves unexpectedly.
            return True

    def _start_counting(self) -> None:
        """
        Enter COUNTING state and initialize the budget window.

        If reasoning_budget is 0, immediately enter FORCING state.
        """
        self.state = ReasoningBudgetState.COUNTING
        self.remaining = self.reasoning_budget
        self.end_matcher.reset()
        self.force_pos = 0
        self._log(f"reasoning_start matched; entering COUNTING state (budget={self.reasoning_budget}).")

        if self.remaining <= 0:
            self.state = ReasoningBudgetState.FORCING
            self._log("budget is 0; entering FORCING state immediately.")

    def _accept(self, token: int) -> None:
        """
        Update sampler state after one token has been accepted.

        This method does not modify logits. It only tracks:
            - whether reasoning_start has appeared
            - whether reasoning_end has appeared
            - how much budget remains
            - where we are in the forced token sequence
        """
        self.generated_tokens += 1

        if self.state == ReasoningBudgetState.IDLE:
            if self.start_matcher.advance(token):
                self._start_counting()
                return

            # Safety for non-reasoning models:
            #
            # If no reasoning_start appears near the beginning, assume this
            # completion has no visible reasoning block. Switch to DONE forever
            # so later literal mentions of reasoning_start do not accidentally
            # activate the budget controller.
            if (
                self.start_max_tokens is not None
                and self.generated_tokens >= self.start_max_tokens
            ):
                self.state = ReasoningBudgetState.DONE
                self._log(
                    f"reasoning_start not found within {self.start_max_tokens} generated tokens; "
                    "switching to DONE passthrough."
                )
            return

        if self.state in (
            ReasoningBudgetState.COUNTING,
            ReasoningBudgetState.WAITING_UTF8,
        ):
            # A natural end always wins, even while waiting for a UTF-8 boundary.
            if self.end_matcher.advance(token):
                self.state = ReasoningBudgetState.DONE
                self._log("reasoning_end matched naturally; switching to DONE passthrough.")
                return

            utf8_complete = self._token_utf8_complete(token)

            if self.state == ReasoningBudgetState.WAITING_UTF8:
                if utf8_complete:
                    self.state = ReasoningBudgetState.FORCING
                    self.force_pos = 0
                    self.end_matcher.reset()
                    self._log("UTF-8 boundary reached; entering FORCING state.")
                return

            self.remaining -= 1
            if self.remaining <= 0:
                if utf8_complete:
                    self.state = ReasoningBudgetState.FORCING
                    self.force_pos = 0
                    self.end_matcher.reset()
                    self._log("reasoning budget exhausted; entering FORCING state.")
                else:
                    self.state = ReasoningBudgetState.WAITING_UTF8
                    self.end_matcher.reset()
                    self._log("reasoning budget exhausted; waiting for UTF-8 boundary before forcing.")
            return

        if self.state == ReasoningBudgetState.FORCING:
            # The accepted token is the one _apply forced; move to the next.
            self.force_pos += 1
            if self.force_pos >= len(self.forced_tokens):
                self.state = ReasoningBudgetState.DONE
                self._log("forced end sequence completed; switching to DONE passthrough.")
            return

        if self.state == ReasoningBudgetState.DONE:
            # Only the first reasoning block is budget-controlled.
            # Later reasoning tags are normal generated text.
            return

    def _apply(self, cur_p: llama_cpp.llama_token_data_array) -> None:
        """
        Apply logits forcing before sampling.

        In FORCING state, only forced_tokens[force_pos] is allowed. All other
        candidate logits are set to -inf. The forced token is set to +inf to make
        the intent explicit and robust against previous logit modifications.

        Raises:
            RuntimeError: If the forced token is not in the candidate array.
                The candidates are left untouched in that case, so the failure
                does not also wipe every remaining logit.
        """
        if self.state != ReasoningBudgetState.FORCING:
            return

        if self.force_pos >= len(self.forced_tokens):
            return

        forced = self.forced_tokens[self.force_pos]
        data = cur_p.data
        size = cur_p.size

        # Check presence first: mutating before this check would leave the
        # array with nothing but -inf logits on the error path.
        if not any(data[i].id == forced for i in range(size)):
            raise RuntimeError(
                f"ReasoningBudgetSampler: forced token {forced} is not present "
                "in the candidate array. Move ReasoningBudgetSampler earlier in "
                "the sampler chain."
            )

        pos_inf = float("inf")
        neg_inf = float("-inf")
        for i in range(size):
            data[i].logit = pos_inf if data[i].id == forced else neg_inf

        # Logits changed, so any previous ordering/selection is stale.
        cur_p.sorted = False
        cur_p.selected = -1

    def _reset(self) -> None:
        """
        Reset the sampler to its configured initial state.

        Uses self.initial_state to determine whether to start in:
            - IDLE: wait for reasoning_start token sequence
            - COUNTING: prompt already contains start token, begin counting immediately

        Also resets internal counters and matchers:
            - remaining budget
            - generated_tokens
            - start_matcher / end_matcher positions
            - force_pos
        """
        self.state = self.initial_state
        self.remaining = self.reasoning_budget
        self.generated_tokens = 0
        self.force_pos = 0

        # start_matcher always exists (it is an empty matcher when no start
        # tokens were given), so it can be reset unconditionally.
        self.start_matcher.reset()
        self.end_matcher.reset()

        # If initial_state = COUNTING and budget is zero, immediately enter FORCING
        if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0:
            self.state = ReasoningBudgetState.FORCING

        self._log(f"reset to {self.state.name} state.")

    def _clone(self):
        """
        Clone the full runtime state.

        Builds a new sampler with the same configuration, then copies the
        runtime fields (state, remaining budget, force position, generated
        token count and both matcher positions).

        Returns:
            The result of get_sampler() on the cloned Python object.
        """
        cloned = ReasoningBudgetSampler(
            model=self.model,
            reasoning_budget=self.reasoning_budget,
            start_tokens=self.start_matcher.tokens,
            end_tokens=self.end_matcher.tokens,
            forced_tokens=self.forced_tokens,
            initial_state=self.initial_state,
            start_max_tokens=self.start_max_tokens,
            wait_utf8=self.wait_utf8,
            verbose=self.verbose,
        )

        cloned.remaining = self.remaining
        cloned.state = self.state
        cloned.force_pos = self.force_pos
        cloned.generated_tokens = self.generated_tokens
        cloned.start_matcher.pos = self.start_matcher.pos
        cloned.end_matcher.pos = self.end_matcher.pos

        # Keep the cloned Python object alive on the source sampler. The cloned
        # LlamaSampler wrapper does not own this object directly because the C
        # sampler clone is created through the callback.
        self._clone_keep_alive.append(cloned)

        return cloned.get_sampler()
+ self._keep_alive.append(custom_sampler) + def get_seed(self) -> int: assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 015cec9faa..7669e2a722 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,9 @@ import sys import ctypes import logging +from dataclasses import dataclass, field +from typing import Iterable, Optional, TextIO, Union + import llama_cpp._ggml as _ggml import llama_cpp.llama_cpp as llama_cpp_lib @@ -12,42 +15,399 @@ # GGML_LOG_LEVEL_DEBUG = 4, # GGML_LOG_LEVEL_CONT = 5, // continue previous log # }; -GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { - 0: logging.CRITICAL, - 1: logging.INFO, - 2: logging.WARNING, - 3: logging.ERROR, - 4: logging.DEBUG, - 5: logging.DEBUG, +GGML_LOG_LEVEL_NONE = 0 +GGML_LOG_LEVEL_INFO = 1 +GGML_LOG_LEVEL_WARN = 2 +GGML_LOG_LEVEL_ERROR = 3 +GGML_LOG_LEVEL_DEBUG = 4 +GGML_LOG_LEVEL_CONT = 5 + +# common/log.h model: +# +# LOG_LEVEL_OUTPUT = 0 +# LOG_LEVEL_ERROR = 1 +# LOG_LEVEL_WARN = 2 +# LOG_LEVEL_INFO = 3 +# LOG_LEVEL_TRACE = 4 +# LOG_LEVEL_DEBUG = 5 +# +# Rule: +# +# event_verbosity <= verbosity_threshold => print +# +# Larger threshold means more verbose output. +# +LOG_LEVEL_OUTPUT = 0 +LOG_LEVEL_ERROR = 1 +LOG_LEVEL_WARN = 2 +LOG_LEVEL_INFO = 3 +LOG_LEVEL_TRACE = 4 +LOG_LEVEL_DEBUG = 5 + +LOG_DEFAULT_LLAMA = LOG_LEVEL_INFO +LOG_DEFAULT_DEBUG = LOG_LEVEL_DEBUG + +# Match the updated common_log_default_callback behavior: +# INFO -> TRACE +# CONT -> TRACE +# +# This is slightly more conservative for verbosity=3: +# if the backend emits INFO through ggml_log_callback, Python will hide it unless +# verbosity >= 4. This mirrors the current upstream default callback behavior. 
+GGML_LEVEL_TO_VERBOSITY = { + GGML_LOG_LEVEL_NONE: LOG_LEVEL_OUTPUT, + GGML_LOG_LEVEL_ERROR: LOG_LEVEL_ERROR, + GGML_LOG_LEVEL_WARN: LOG_LEVEL_WARN, + GGML_LOG_LEVEL_INFO: LOG_LEVEL_TRACE, + GGML_LOG_LEVEL_DEBUG: LOG_LEVEL_DEBUG, + GGML_LOG_LEVEL_CONT: LOG_LEVEL_TRACE, # fallback only; CONT inherits previous +} + +GGML_LEVEL_TO_PYTHON_LEVEL = { + GGML_LOG_LEVEL_NONE: logging.INFO, + GGML_LOG_LEVEL_ERROR: logging.ERROR, + GGML_LOG_LEVEL_WARN: logging.WARNING, + GGML_LOG_LEVEL_INFO: logging.INFO, + GGML_LOG_LEVEL_DEBUG: logging.DEBUG, + GGML_LOG_LEVEL_CONT: logging.INFO, # fallback only; CONT inherits previous } + +# Default substring filters. +# +# These are intentionally simple substring filters instead of hard-coded +# special branches. Users can replace or clear them with set_log_filters(). +DEFAULT_LOG_FILTERS = [ + "CUDA Graph", + "CUDA graph" +] + + +VerbosityLike = Union[bool, int, str, None] + logger = logging.getLogger("llama-cpp-python") -_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] -# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +@dataclass +class LoggerConfig: + # 0=output, 1=error, 2=warn, 3=info, 4=trace, 5=debug + verbosity: int = LOG_DEFAULT_LLAMA + + show_output: bool = True + + stdout: TextIO = sys.stdout + stderr: TextIO = sys.stderr + + # If any substring is contained in a log message, the message is dropped. + log_filters: list[str] = field(default_factory=lambda: list(DEFAULT_LOG_FILTERS)) + log_filters_case_sensitive: bool = True + + +_config = LoggerConfig() +_last_verbosity = LOG_LEVEL_INFO + + +def _normalize_verbosity( + value: VerbosityLike, + *, + default: int = LOG_DEFAULT_LLAMA, +) -> int: + """ + Convert user input to llama.cpp-style verbosity 0..5. 
+ + Compatibility: + verbose=False -> ERROR (1) + verbose=True -> DEBUG (5) + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if value is None: + return default + + if isinstance(value, bool): + return LOG_LEVEL_DEBUG if value else LOG_LEVEL_ERROR + + if isinstance(value, int): + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, value)) + + if isinstance(value, str): + key = value.strip().lower() + aliases = { + "0": LOG_LEVEL_OUTPUT, + "output": LOG_LEVEL_OUTPUT, + "none": LOG_LEVEL_OUTPUT, + + "1": LOG_LEVEL_ERROR, + "error": LOG_LEVEL_ERROR, + "err": LOG_LEVEL_ERROR, + "silent": LOG_LEVEL_ERROR, + + "2": LOG_LEVEL_WARN, + "warn": LOG_LEVEL_WARN, + "warning": LOG_LEVEL_WARN, + "quiet": LOG_LEVEL_WARN, + + "3": LOG_LEVEL_INFO, + "info": LOG_LEVEL_INFO, + "default": LOG_DEFAULT_LLAMA, + "normal": LOG_DEFAULT_LLAMA, + + "4": LOG_LEVEL_TRACE, + "trace": LOG_LEVEL_TRACE, + "trc": LOG_LEVEL_TRACE, + + "5": LOG_LEVEL_DEBUG, + "debug": LOG_LEVEL_DEBUG, + "verbose": LOG_LEVEL_DEBUG, + } + + if key in aliases: + return aliases[key] + + try: + parsed = int(key) + except ValueError as exc: + raise ValueError( + "_logger._normalize_verbosity: " + "verbosity must be one of 0..5, bool, None, or " + "'silent'/'quiet'/'info'/'trace'/'debug'" + ) from exc + + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, parsed)) + + raise TypeError(f"_logger._normalize_verbosity: unsupported verbosity type: {type(value)!r}") + + +def _verbosity_to_python_level(verbosity: int) -> int: + if verbosity >= LOG_LEVEL_DEBUG: + return logging.DEBUG + if verbosity >= LOG_LEVEL_INFO: + return logging.INFO + if verbosity >= LOG_LEVEL_WARN: + return logging.WARNING + return logging.ERROR + + +def _get_verbosity(level: int) -> int: + """ + Map ggml log level to Python-side verbosity. + + GGML_LOG_LEVEL_INFO maps to LOG_LEVEL_INFO so that verbosity=3 remains + useful as the default info level. 
def _decode_log_text(text: Optional[bytes]) -> str:
    """
    Decode a native log message to str.

    ctypes passes a NULL char pointer to Python as None, so a missing or empty
    message decodes to "" instead of raising inside the callback. Invalid
    UTF-8 bytes are replaced rather than raising.
    """
    if not text:
        return ""
    return text.decode("utf-8", errors="replace")


def _matches_log_filter(msg: str) -> bool:
    """
    Return True if msg contains any configured substring filter.

    Empty filter entries are ignored. Matching is case-insensitive when
    _config.log_filters_case_sensitive is False.
    """
    filters = _config.log_filters
    if not filters:
        return False

    if _config.log_filters_case_sensitive:
        return any(item and item in msg for item in filters)

    msg_lower = msg.lower()
    return any(item and item.lower() in msg_lower for item in filters)


def _should_drop(level: int, verbosity: int, msg: str) -> bool:
    """
    Decide whether a log message must be suppressed.

    A message is dropped when its verbosity exceeds the configured threshold,
    when it is OUTPUT-level (GGML_LOG_LEVEL_NONE) and show_output is disabled,
    or when it matches a substring filter.
    """
    if verbosity > _config.verbosity:
        return True

    if level == GGML_LOG_LEVEL_NONE and not _config.show_output:
        return True

    if _matches_log_filter(msg):
        return True

    return False


@_ggml.ggml_log_callback
def ggml_log_callback(
    level: int,
    text: bytes,
    user_data: ctypes.c_void_p,
):
    """
    Native log callback registered with llama_log_set().

    Messages are filtered by _should_drop() and printed without a trailing
    newline. OUTPUT-level messages go to _config.stdout, everything else to
    _config.stderr.

    GGML_LOG_LEVEL_CONT chunks continue the previous message: they inherit its
    verbosity, and a continuation of an OUTPUT-level message is also treated
    as OUTPUT so it uses the same stream and honours show_output.
    """
    global _last_verbosity

    msg = _decode_log_text(text)

    if level == GGML_LOG_LEVEL_CONT:
        verbosity = _last_verbosity
        # _get_verbosity() maps only GGML_LOG_LEVEL_NONE to LOG_LEVEL_OUTPUT,
        # so an inherited OUTPUT verbosity means the previous chunk was output.
        if verbosity == LOG_LEVEL_OUTPUT:
            level = GGML_LOG_LEVEL_NONE
    else:
        verbosity = _get_verbosity(level)
        _last_verbosity = verbosity

    if _should_drop(level, verbosity, msg):
        return

    out = _config.stdout if level == GGML_LOG_LEVEL_NONE else _config.stderr
    print(msg, end="", flush=True, file=out)


# Keep a global reference to avoid ctypes callback being garbage-collected.
_ggml_log_callback_ref = ggml_log_callback

llama_cpp_lib.llama_log_set(_ggml_log_callback_ref, ctypes.c_void_p(0))
+ + Priority: + silent > quiet > verbosity > verbose > current config + + Compatibility: + verbose=False -> ERROR + verbose=True -> DEBUG + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if silent is True: + v = LOG_LEVEL_ERROR + elif quiet is True: + v = LOG_LEVEL_WARN + elif verbosity is not None: + v = _normalize_verbosity(verbosity) + elif verbose is not None: + v = _normalize_verbosity(verbose) + else: + v = _config.verbosity + + _config.verbosity = v + logger.setLevel(_verbosity_to_python_level(v)) + + if show_output is not None: + _config.show_output = show_output + + if log_filters is not None: + _config.log_filters = [s for s in log_filters if s] + + if append_log_filters is not None: + _config.log_filters.extend(s for s in append_log_filters if s) + + if log_filters_case_sensitive is not None: + _config.log_filters_case_sensitive = log_filters_case_sensitive def set_verbose(verbose: bool): - logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + """ + Backward-compatible bool API. + + False -> ERROR + True -> DEBUG + """ + configure_logging(verbose=verbose) + + +def set_verbosity(verbosity: VerbosityLike): + configure_logging(verbosity=verbosity) + + +def get_verbosity() -> int: + return _config.verbosity + + +def set_quiet(quiet: bool = True): + configure_logging(quiet=quiet) + + +def set_silent(silent: bool = True): + configure_logging(silent=silent) + + +def set_log_filters( + filters: Iterable[str], + *, + case_sensitive: bool = True, +): + """ + Replace all substring log filters. + + Example: + set_log_filters(["CUDA Graph id", "clip_model_loader: tensor"]) + """ + configure_logging( + log_filters=filters, + log_filters_case_sensitive=case_sensitive, + ) + + +def get_log_filters() -> list[str]: + return list(_config.log_filters) + + +def add_log_filters(filters: Iterable[str]): + """ + Append substring log filters. 
+ """ + configure_logging(append_log_filters=filters) + + +def clear_log_filters(): + """ + Clear all substring log filters, including default filters. + """ + _config.log_filters.clear() + + +def reset_log_filters(): + """ + Restore default substring log filters. + """ + _config.log_filters = list(DEFAULT_LOG_FILTERS) + + +def get_log_filters_case_sensitive() -> bool: + return _config.log_filters_case_sensitive + + +def reset_logging(): + """ + Reset logging to default llama.cpp-style INFO verbosity and default filters. + """ + _config.verbosity = LOG_DEFAULT_LLAMA + _config.show_output = True + _config.log_filters = list(DEFAULT_LOG_FILTERS) + _config.log_filters_case_sensitive = True + logger.setLevel(_verbosity_to_python_level(_config.verbosity)) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..b6a2c8d5a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -45,6 +45,7 @@ from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp.llama_multimodal as llama_multimodal from llama_cpp.llama_speculative import LlamaDraftModel @@ -57,8 +58,19 @@ ) from ._ggml import ( ggml_backend_cpu_buffer_type, + ggml_backend_load_all_from_path, + ggml_backend_reg_count +) +from ._logger import ( + configure_logging, + get_verbosity, + set_verbosity, + get_log_filters, + set_log_filters, + add_log_filters, + clear_log_filters, + reset_log_filters, ) -from ._logger import set_verbose from ._utils import suppress_stdout_stderr @@ -85,6 +97,7 @@ class Llama: def __init__( self, model_path: str, + mmproj_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -108,8 +121,13 @@ def __init__( n_batch: int = 2048, n_ubatch: int = 512, n_seq_max: int = 1, + n_rs_seq: int = 0, + n_outputs_max: int = 0, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, + ctx_type: 
Optional[ + int + ] = llama_cpp_lib.llama_context_type.LLAMA_CONTEXT_TYPE_DEFAULT, rope_scaling_type: Optional[ int ] = llama_cpp_lib.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -131,8 +149,9 @@ def __init__( swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, # HybridCheckpointCache Params - ctx_checkpoints: int = 32, + ctx_checkpoints: int = 16, checkpoint_interval: int = 4096, + checkpoint_on_device: bool = False, # Sampling Params last_n_tokens_size: int = 64, # Backend Params @@ -149,8 +168,14 @@ def __init__( type_v: Optional[int] = None, # Misc spm_infill: bool = False, + # Log verbose: bool = True, + verbosity: Optional[Union[int, str, bool]] = None, + log_filters: Optional[Sequence[str]] = None, + log_filters_case_sensitive: bool = True, # Extra Params + chat_template_name: Optional[str] = None, + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. @@ -227,17 +252,38 @@ def __init__( kv_unified: use single unified KV buffer for the KV cache of all sequences ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) checkpoint_interval: Hybrid model checkpoint token intervals, and archiving of text with interval sizes along the way. + checkpoint_on_device: Store hybrid/recurrent checkpoint tensor payloads in llama_context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. numa: numa policy chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. - verbose: Print verbose output to stderr. 
type_k: KV cache data type for K (default: f16) type_v: KV cache data type for V (default: f16) spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. - + verbose: Backward-compatible boolean switch for native llama.cpp / ggml runtime logs. + False keeps only error-level native logs; True enables debug-level native logs. + If `verbosity` is provided, `verbosity` takes precedence over `verbose`. + verbosity: Fine-grained llama.cpp-style native runtime log verbosity. + Accepts 0-5, bool, or string aliases. + Numeric levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + Use `verbosity=3` for llama.cpp-style default info logs. + `verbose=False` remains equivalent to error-only logging, while + `verbose=True` remains equivalent to debug logging. + log_filters: Optional substring filters for native runtime logs. + If any provided substring appears in a decoded backend log message, + that message is suppressed. By default, the logger may include built-in + filters for noisy low-level logs such as CUDA Graph reuse spam messages. + Pass an empty list to disable all substring filtering for this instance. + log_filters_case_sensitive: Whether `log_filters` should match case-sensitively. + Defaults to True for predictable low-level backend log filtering. Raises: ValueError: If the model path does not exist. @@ -245,13 +291,50 @@ def __init__( A Llama instance. """ self.verbose = verbose + self.verbosity = verbosity self._stack = contextlib.ExitStack() - set_verbose(verbose) + configure_logging( + verbose=verbose, + verbosity=verbosity, + log_filters=log_filters, + log_filters_case_sensitive=log_filters_case_sensitive, + ) + # llama.cpp / ggml backend initialization is process-global. + # Run it once before loading any model. 
if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): llama_cpp_lib.llama_backend_init() + + # Wheels built with `GGML_BACKEND_DL` ship ggml backends as separate + # dynamic libraries under llama_cpp/lib, for example: + # + # ggml-cpu-x64.dll + # ggml-cpu-haswell.dll + # ggml-cpu-alderlake.dll + # ggml-cuda.dll + # + # With the dynamic backend layout, llama_backend_init() initializes + # the global backend system but does not necessarily register every + # packaged backend. Loading the package lib directory ensures ggml can + # discover CPU variants and optional accelerator backends before model + # loading. + lib_dir = Path(llama_cpp_lib.__file__).resolve().parent / "lib" + + if not lib_dir.exists(): + raise FileNotFoundError(f"Llama.__init__: llama_cpp lib directory not found: {lib_dir}") + + # Load all dynamic ggml backend plugins from the packaged lib directory. + ggml_backend_load_all_from_path( + ctypes.c_char_p(str(lib_dir).encode("utf-8")) + ) + + # Print the number of backend registrations to confirm whether the DLL is loaded. + if self.verbose: + count = ggml_backend_reg_count() + print(f"Llama.__init__: Loaded ggml backend registry count: {count}", file=sys.stderr) + Llama.__backend_initialized = True if isinstance(numa, bool): @@ -400,6 +483,8 @@ def __init__( self.n_batch = min(n_ctx, n_batch) # ??? 
self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max + self.n_rs_seq = n_rs_seq + self.n_outputs_max = n_outputs_max self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -411,9 +496,18 @@ def __init__( self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - self.context_params.n_seq_max = self.n_seq_max + + self.context_params.n_seq_max = max(1, self.n_seq_max) + if self.context_params.n_seq_max > llama_cpp_lib.LLAMA_MAX_SEQ: + raise RuntimeError(f"n_seq_max must be <= {llama_cpp_lib.LLAMA_MAX_SEQ}") + + self.context_params.n_rs_seq = self.n_rs_seq + self.context_params.n_outputs_max = self.n_batch if self.n_outputs_max == 0 else self.n_outputs_max self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch + + self.context_params.ctx_type = ctx_type + self.context_params.ctx_other = None self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None @@ -541,6 +635,7 @@ def __init__( _is_recurrent = self._model.is_recurrent() _is_hybrid = self._model.is_hybrid() _n_swa = self._model.n_swa() + # Sync llama.cpp upstream (#20291): warn swa-full is not supported for non-SWA models. if _n_swa == 0: if (self.context_params.swa_full): @@ -555,13 +650,25 @@ def __init__( if self.is_hybrid: if self.verbose: - print(f"Llama.__init__: Hybrid/Recurrent model detected." - f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). " - f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}, checkpoint_interval={checkpoint_interval}).", - file=sys.stderr) + print( + f"Llama.__init__: Hybrid/Recurrent model detected. " + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, " + f"n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). 
" + f"Enabling HybridCheckpointCache(" + f"ctx_checkpoints={ctx_checkpoints}, " + f"checkpoint_interval={checkpoint_interval}, " + f"on_device={checkpoint_on_device}).", + file=sys.stderr, + ) self.ctx_checkpoints = ctx_checkpoints self.checkpoint_interval = checkpoint_interval - self._hybrid_cache_mgr = HybridCheckpointCache(self._ctx.ctx, max_checkpoints=self.ctx_checkpoints, verbose=self.verbose) + self.checkpoint_on_device = checkpoint_on_device + self._hybrid_cache_mgr = HybridCheckpointCache( + self._ctx.ctx, + max_checkpoints=self.ctx_checkpoints, + on_device=self.checkpoint_on_device, + verbose=self.verbose, + ) else: self._hybrid_cache_mgr = None @@ -590,9 +697,6 @@ def __init__( self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - self._token_nl = self.token_nl() - self._token_eos = self.token_eos() - self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 @@ -601,23 +705,67 @@ def __init__( try: self.metadata = self._model.metadata() + self.model_desc = self._model.model_desc() + # The total size of all the tensors in the model in bytes + self.model_size = self._model.model_size() + except Exception as e: self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) + + if mmproj_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `mmproj_path` are not null. 
Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_multimodal.GenericMTMDChatHandler( + chat_format = self.metadata.get("tokenizer.chat_template", None), + mmproj_path = mmproj_path, + verbose = self.verbose, + chat_template_name=chat_template_name, + **chat_handler_kwargs + ) if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + print(f"Model desc: {self.model_desc}, " + f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " + f"Model metadata: {self.metadata}", + file=sys.stderr) eos_token_id = self.token_eos() bos_token_id = self.token_bos() + eot_token_id = self.token_eot() + sep_token_id = self.token_sep() + nl_token_id = self.token_nl() + pad_token_id = self.token_pad() + mask_token_id = self.token_mask() + + def _token_text(token_id: int) -> str: + return self._model.token_get_text(token_id) if token_id != -1 else "" + + bos_token = _token_text(bos_token_id) + eos_token = _token_text(eos_token_id) + + special_tokens_map = { + name: text + for name, token_id in { + "eot_token": eot_token_id, + "sep_token": sep_token_id, + "nl_token": nl_token_id, + "pad_token": pad_token_id, + "mask_token": mask_token_id, + }.items() + if token_id != -1 and (text := _token_text(token_id)) + } - eos_token = ( - self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" - ) - bos_token = ( - self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" - ) + stop_token_ids = [ + token_id + for token_id in (eos_token_id, eot_token_id) + if token_id != -1 + ] + + if not stop_token_ids: + stop_token_ids = None # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates template_choices = dict( @@ -641,14 +789,14 @@ def __init__( for name, template in template_choices.items(): try: # Attempt to parse and register the template as a valid chat handler. 
- # We wrap this in a try-block because some models (like LLaVA) contain - # non-standard Jinja2 tags (e.g., {% generation %}) that cause the - # standard parser to crash. + # Keep this guarded because model metadata may contain malformed or + # model-specific Jinja templates that still cannot be rendered by this runtime. self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( template=template, eos_token=eos_token, bos_token=bos_token, - stop_token_ids=[eos_token_id], + stop_token_ids=stop_token_ids, + special_tokens_map=special_tokens_map, ).to_chat_handler() except Exception as e: # If parsing fails (e.g., TemplateSyntaxError), log a warning but do not crash. @@ -780,6 +928,71 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self._logits_all else 1, ) + # Logger API + + def set_verbosity(self, verbosity: Union[int, str, bool, None]) -> None: + """Set native llama.cpp / ggml runtime log verbosity for this process. + + Levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + + Note: + Native backend logging is process-global because llama.cpp / ggml use + a global log callback. Changing this affects all Llama instances in + the current Python process. + """ + set_verbosity(verbosity) + self.verbosity = get_verbosity() + self.verbose = self.verbosity >= 5 + + + def get_verbosity(self) -> int: + """Return the current native runtime log verbosity.""" + return get_verbosity() + + + def set_log_filters( + self, + filters: Sequence[str], + *, + case_sensitive: bool = True, + ) -> None: + """Replace substring filters for native runtime logs. + + Any backend log message containing one of these substrings will be + suppressed. Pass an empty list to disable all substring filtering. + + Note: + Native backend logging is process-global, so this affects all Llama + instances in the current Python process. 
+ """ + set_log_filters(filters, case_sensitive=case_sensitive) + + + def add_log_filters(self, filters: Sequence[str]) -> None: + """Append substring filters for native runtime logs.""" + add_log_filters(filters) + + + def get_log_filters(self) -> List[str]: + """Return the current substring filters for native runtime logs.""" + return get_log_filters() + + + def clear_log_filters(self) -> None: + """Clear all substring filters, including default filters.""" + clear_log_filters() + + + def reset_log_filters(self) -> None: + """Restore default substring filters for native runtime logs.""" + reset_log_filters() + # LoRA / Adapter Management API def load_lora(self, name: str, path: str): @@ -875,11 +1088,20 @@ def eval( tokens: Sequence[int], active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + copy_logits: bool = True, ): """Evaluate a list of tokens. Args: - tokens: The list of tokens to evaluate. + tokens: The token ids to evaluate. + active_loras: Optional LoRA adapters to apply for this evaluation. + Each item should contain a ``name`` and an optional ``scale``. + control_vector: Optional control vector configuration to apply during + this evaluation. + copy_logits: Whether to copy the final logits into ``self.scores`` when + ``logits_all`` is disabled. Set to ``False`` for native sampler paths + that sample directly from the llama context and do not need + Python-side logits. """ n_eval = len(tokens) if n_eval == 0: @@ -1086,9 +1308,11 @@ def eval( if self.verbose: print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr) - # Save the final logit if not in _logits_all mode - if not self._logits_all: - logits_ptr = self._ctx.get_logits() + # Save the final logits only when Python-side logits are required. 
+ # Native sampler can sample directly from ctx, so normal generation does not + # need to copy n_vocab floats into self.scores on every token. + if not self._logits_all and copy_logits: + logits_ptr = self._ctx.get_logits_ith(-1) logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) self.scores[0, :] = logits_view @@ -1148,6 +1372,13 @@ def sample( grammar_lazy: bool = False, idx: Optional[int] = None, seed: Optional[int] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ): """Sample a token from the model. Returns: @@ -1206,6 +1437,16 @@ def sample( logit_bias=self._convert_logit_bias(logit_bias), grammar=grammar.grammar if grammar else "", grammar_lazy=grammar_lazy, + + # Reasoning Budget + # This generic controller only counts the first visible reasoning + # block. Use reasoning_budget=-1 to leave it disabled. + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # LogitsProcessor Adapter @@ -1280,6 +1521,13 @@ def generate( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
@@ -1325,6 +1573,18 @@ def generate( grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax. grammar_lazy: If True, activates grammar constraints only on specific trigger tokens. seed: RNG seed for sampling. Overrides the instance seed. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the reasoning budget sampler, 0 forces the block to end + immediately after it starts, and N > 0 allows at most N generated tokens. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + Defaults to "". Pass a model-specific value for non-default tags. + reasoning_end: Token/text sequence that marks the natural and forced end of the reasoning block. + Defaults to "". + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template has already inserted reasoning_start, + so counting starts from the first generated token. + reasoning_start_max_tokens: Safety window for non-reasoning models. If reasoning_start is not + generated within this many output tokens, the sampler becomes a no-op. Set None to wait indefinitely. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -1475,6 +1735,16 @@ def generate( grammar=grammar._grammar if grammar else "", grammar_lazy=grammar_lazy, seed=seed if seed is not None else self._seed, + + # Reasoning Budget + # Keeps the core sampler model-agnostic: callers provide the visible + # reasoning start/end tags, and -1 keeps the controller disabled. 
+ reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # Register custom python-level logits processors if provided @@ -1506,6 +1776,14 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): self._sampling_ctx = LlamaSamplingContext(params, self._model) + # Native sampler samples directly from ctx. Python-side logits are only needed + # for compatibility hooks that explicitly consume self._scores. + copy_logits = ( + self._logits_all + or logits_processor is not None + or stopping_criteria is not None + ) + sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) @@ -1525,8 +1803,13 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): body_tokens = tokens[:-1] last_token = [tokens[-1]] - # 1. Evaluate up to N-1 - self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector) + # 1. Evaluate up to N-1 without copying logits. + self.eval( + body_tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=False, + ) # 2. Save the N-1 state snapshot current_history = self._input_ids[:self.n_tokens].tolist() @@ -1535,11 +1818,21 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): tokens=current_history, seq_id=0 ) - # 3. Evaluate the final token to refresh logits - self.eval(last_token, active_loras=active_loras, control_vector=control_vector) + # 3. Evaluate final token. Copy logits only if Python-side hooks need them. 
+ self.eval( + last_token, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) else: # Standard evaluation or single-token generation step - self.eval(tokens, active_loras=active_loras, control_vector=control_vector) + self.eval( + tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) # Sample loop while sample_idx < self.n_tokens: @@ -1835,6 +2128,13 @@ def _create_completion( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -2023,6 +2323,12 @@ def _create_completion( seed=seed if seed is not None else self._seed, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ): if llama_cpp_lib.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -2487,6 +2793,13 @@ def create_completion( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> 
Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2531,6 +2844,14 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). 
@@ -2590,6 +2911,12 @@ def create_completion( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -2641,6 +2968,13 @@ def __call__( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2685,6 +3019,14 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. 
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -2744,6 +3086,12 @@ def __call__( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion( @@ -2795,6 +3143,13 @@ def create_chat_completion( top_logprobs: Optional[int] = None, assistant_prefill: bool = False, add_generation_prompt: bool = True, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2842,6 +3197,14 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. 
+ reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -2908,6 +3271,12 @@ def create_chat_completion( control_vector=control_vector, assistant_prefill=assistant_prefill, add_generation_prompt=add_generation_prompt, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index dc1dd20d7c..ee37df1200 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -352,58 +352,169 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): @dataclass class HybridCheckpoint: - """Represents a single snapshot of the RNN/Hybrid model's hidden state.""" - pos: int # The token position (cursor) where this snapshot was taken - data: bytes # The raw binary RNN state data - hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching - size: int # Size of the state data in bytes - seq_id: int # Sequence ID this checkpoint belongs to + """ + Represents a single snapshot of the Hybrid/Recurrent model state. + + Notes: + - When on_device=False, `data` contains the full host-side serialized state. + - When on_device=True, `data` contains only the host-visible portion of the + serialized state. The tensor payload is stored in llama_context-owned + device buffers by llama.cpp, keyed by seq_id. + """ + pos: int # The token position (cursor) where this snapshot was taken. 
+ data: bytes # The raw binary RNN state data. + hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching. + size: int # Number of bytes written by llama_state_seq_get_data_ext(). + seq_id: int # Sequence id used by llama.cpp state APIs. class HybridCheckpointCache(BaseLlamaCache): """ - Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. - Provides rollback capabilities for models that cannot physically truncate KV cache. + Checkpoint manager for Hybrid/Recurrent model states. + + This cache is designed for models whose memory cannot be safely truncated like + a regular Transformer KV cache. For recurrent/hybrid architectures, rollback is + implemented by saving and restoring sequence state snapshots. + + Two operating modes are supported: + + 1. Host mode: on_device=False + - Full checkpoint payload is materialized as Python bytes. + - Multiple checkpoints per seq_id are safe. + - This mode is suitable for multi-turn rollback and longer conversation reuse. + + 2. Device mode: on_device=True + - LLAMA_STATE_SEQ_FLAGS_ON_DEVICE is forwarded to llama.cpp. + - Tensor payloads are stored in llama_context-owned device buffers. + - The device buffers are created per seq_id in llama.cpp. + - Therefore only one active checkpoint per seq_id is safe. + - This mode is suitable for fast speculative / branch rollback where avoiding + device-to-host tensor copies is more important than keeping many historical + checkpoints. + + Important: + Do not treat on_device=True as "Python owns a VRAM checkpoint". Python only + owns the host-visible serialized portion. The tensor payload lives inside the + llama_context and is keyed by seq_id. 
""" - def __init__(self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): + def __init__( + self, + ctx: llama_cpp_lib.llama_context_p, + max_checkpoints: int = 16, + on_device: bool = False, + verbose: bool = False + ): + """ + Args: + ctx (llama_context_p): + Borrowed llama.cpp context pointer used by the state sequence APIs. + This cache does not own the context and must not free it. + + max_checkpoints(int): Maximum number of Python-side checkpoint entries to keep. + - Host mode: This is the maximum number of historical checkpoints across all seq_ids. + - Device mode: This is still a global upper bound for Python-side metadata entries, + but this class also enforces at most one active checkpoint per seq_id, + because llama.cpp stores device tensor payloads per seq_id. + + on_device(bool): Whether to request llama.cpp to keep tensor checkpoint payloads in + context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. + + verbose(bool): Enables diagnostic logging to stderr for checkpoint save/restore/eviction. + """ if ctx is None: - raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context") + raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with a null model context") self._ctx = ctx + self.on_device = on_device + self.verbose = verbose + + # In host mode, max_checkpoints means "maximum number of Python-owned + # checkpoints across all seq_ids". + # + # In device mode, llama.cpp stores tensor payloads in device buffers keyed + # by seq_id. Multiple Python checkpoint metadata entries for the same seq_id + # would point to the same mutable device-side slot, so only one checkpoint + # per seq_id is safe. self.max_checkpoints = max_checkpoints + + # Python-side checkpoint registry. + # + # Host mode: + # Each HybridCheckpoint owns a full serialized checkpoint payload. 
+ # + # Device mode: + # Each HybridCheckpoint owns only the host-visible serialized portion. + # The corresponding tensor payload is owned by llama_context. self.checkpoints: list[HybridCheckpoint] = [] + + # Total Python-tracked checkpoint size in bytes. + # + # Host mode: + # Roughly equals the total serialized checkpoint payload size. + # + # Device mode: + # Tracks only the host-visible part returned by llama.cpp, not the + # context-owned device tensor storage. self._current_size = 0 - # Cache C-type API function pointers for performance + # Cache C API function pointers for faster repeated calls. self._get_size_ext = llama_cpp_lib.llama_state_seq_get_size_ext self._get_data_ext = llama_cpp_lib.llama_state_seq_get_data_ext self._set_data_ext = llama_cpp_lib.llama_state_seq_set_data_ext - self._flag_partial = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY - self.verbose = verbose - - if self.max_checkpoints <= 0: - if self.verbose: - import sys - print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " - "Rollback capabilities are turned off. This is optimal for single-turn workflows.", - file=sys.stderr) + # State serialization flags forwarded to llama.cpp. + # + # LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY: + # Save only the sequence-specific / partial state needed for recurrent + # rollback instead of a full context state. + # + # LLAMA_STATE_SEQ_FLAGS_ON_DEVICE: + # Ask llama.cpp to store tensor payloads in context-owned device buffers. + self._flags = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY + if on_device: + self._flags |= llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_ON_DEVICE + + if self.max_checkpoints <= 0 and self.verbose: + print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " + "Rollback capabilities are turned off. 
This is optimal for single-turn workflows.", + file=sys.stderr) + + if self.on_device and self.max_checkpoints > 1 and self.verbose: + print( + "HybridCheckpointCache(__init__): on_device=True stores tensor payloads " + "in llama_context-owned device buffers keyed by seq_id. Multiple " + "historical checkpoints for the same seq_id are unsafe, so this cache " + "will keep only one checkpoint per seq_id.", + file=sys.stderr, + ) @property def cache_size(self) -> int: - """Returns the total memory used by all stored checkpoints in bytes.""" + """ + Returns the host-visible checkpoint size tracked by Python. + + In host mode, this is close to the full serialized checkpoint payload size. + In device mode, this is only the host-visible metadata/payload size returned + by llama.cpp. Device-side tensor storage is owned by llama_context and is not + fully represented by this number. + """ return self._current_size def clear(self): - """Clears all stored checkpoints and resets memory tracking.""" + """ + Clears Python-side checkpoint metadata. + + This does not explicitly release llama_context-owned device buffers. The + device buffers are managed by llama.cpp and are associated with the context. + """ if not self.checkpoints: # Empty Checkpoint: Return immediately, no need to clear. 
return self.checkpoints.clear() self._current_size = 0 if self.verbose: - print("HybridCheckpointCache: cleared") + print("HybridCheckpointCache(clear): cleared", file=sys.stderr) def close(self): - self.checkpoints = None + self.clear() self._ctx = None self._get_size_ext = None self._get_data_ext = None @@ -421,23 +532,72 @@ def _hash_prefix(self, tokens: List[int], length: int) -> str: """ if length <= 0: return "empty" - tokens_size = len(tokens) - if length > tokens_size: - length = tokens_size + length = min(length, len(tokens)) data = array.array('i', tokens[:length]).tobytes() return hashlib.sha256(data).hexdigest()[:32] + def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + """ + Removes all Python-side checkpoints for one seq_id. + + Required for on_device=True because llama.cpp stores the device tensor + payload per seq_id, not per Python checkpoint object. + """ + kept: list[HybridCheckpoint] = [] + removed_size = 0 + + for cp in self.checkpoints: + if cp.seq_id == seq_id: + removed_size += cp.size + else: + kept.append(cp) + + self.checkpoints = kept + self._current_size -= removed_size + if self._current_size < 0: + self._current_size = 0 + + def _evict_checkpoints_if_needed(self) -> None: + """ + Evicts old checkpoints if needed + + Host mode: + This evicts full Python-owned checkpoint payloads, so FIFO historical + checkpoints are safe and useful. + + Device mode: + This evicts Python-side metadata only. The device tensor payload is owned + by llama_context and is keyed by seq_id. 
+ """ + while len(self.checkpoints) > self.max_checkpoints: + old_cp = self.checkpoints.pop(0) + self._current_size -= old_cp.size + if self._current_size < 0: + self._current_size = 0 + + if self.verbose: + print( + f"HybridCheckpointCache: evicted checkpoint " + f"seq_id={old_cp.seq_id}, pos={old_cp.pos}", + file=sys.stderr, + ) + def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[HybridCheckpoint]: """ Finds the longest valid checkpoint that perfectly matches the provided token prefix. + + The hash check prevents restoring a checkpoint that has the same length but + belongs to a different prompt/history. + Returns None if no matching checkpoint is found. """ # Empty Checkpoint: Instant return, no hash calculation needed. if self.max_checkpoints <= 0 or len(self.checkpoints) == 0: return None - best_cp = None + best_cp: Optional[HybridCheckpoint] = None best_pos = -1 + for cp in self.checkpoints: if cp.seq_id != seq_id or cp.pos > len(tokens): # Skip if sequence ID mismatches or checkpoint is longer than the current prompt @@ -475,9 +635,17 @@ def save_checkpoint( file=sys.stderr) return False - flags = self._flag_partial + # In on-device mode, remove old Python metadata for this seq_id before saving + # the new checkpoint. The underlying llama.cpp device buffer for this seq_id + # will be overwritten by the get_data_ext() call. + if self.on_device: + self._replace_checkpoint_for_seq_id(seq_id) + + flags = self._flags - # 1. Query the required buffer size from the underlying C++ context + # 1. Query the required host-visible buffer size. + # In on_device mode this may exclude the large tensor payload + # that stays in device memory. size = self._get_size_ext(self._ctx, seq_id, flags) if size == 0: if self.verbose: @@ -487,9 +655,14 @@ def save_checkpoint( # 2. 
Allocate buffer and extract raw state data buffer = (ctypes.c_uint8 * size)() n_written = self._get_data_ext(self._ctx, buffer, size, seq_id, flags) + if n_written != size: if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): get failed {n_written}/{size}") + print( + f"HybridCheckpointCache(save_checkpoint): get_data_ext failed " + f"({n_written}/{size})", + file=sys.stderr, + ) return False # Note: This deep copy isolates the state from subsequent C++ backend mutations @@ -506,19 +679,18 @@ def save_checkpoint( ) self._current_size += n_written - # 4. Enforce capacity limits (FIFO eviction) - while len(self.checkpoints) > self.max_checkpoints: - if not self.checkpoints: - break - old_cp = self.checkpoints.pop(0) - self._current_size -= old_cp.size - if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): evicted pos={old_cp.pos}") + # 4. Evicts old checkpoints if needed + self._evict_checkpoints_if_needed() if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " - f"total={len(self.checkpoints)} used={self._current_size / 1024 / 1024:.2f} MiB", - file=sys.stderr) + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(save_checkpoint): saved {mode} checkpoint " + f"seq_id={seq_id}, pos={current_pos}, size={size / 1024 / 1024:.2f} MiB, " + f"hcc_count={len(self.checkpoints)}, " + f"hcc_mem_used={self._current_size / 1024 / 1024:.2f} MiB", + file=sys.stderr, + ) return True @@ -531,17 +703,38 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: if self.verbose: print(f"HybridCheckpointCache(restore_checkpoint): [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) return False - flags = self._flag_partial - # 2. Verify the underlying C++ context still expects the exact same state size. + # 2. Guard against stale on-device checkpoint objects. 
+ # + # In on_device mode, Python does not own the full checkpoint tensor payload. + # llama.cpp keeps the large tensor payload in llama_context-owned device + # buffers keyed by seq_id. Saving a newer checkpoint for the same seq_id may + # overwrite that device-side payload while an old HybridCheckpoint object can + # still exist outside this cache. + # + # Only checkpoint objects still tracked by this cache are considered valid. + # This avoids restoring old Python metadata together with newer device tensors. + if self.on_device and cp not in self.checkpoints: + if self.verbose: + print( + "HybridCheckpointCache(restore_checkpoint): stale on-device checkpoint; " + "refusing restore because device payload may have been overwritten.", + file=sys.stderr, + ) + return False + + flags = self._flags + + # 3. Verify the underlying C++ context still expects the exact same state size. # This prevents buffer overflows if the backend context was unexpectedly altered or reallocated. current_size = self._get_size_ext(self._ctx, seq_id, flags) if current_size != cp.size: if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} -> possible invalidation") + print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: " + f"expected checkpoint size={cp.size}, got current size={current_size} -> possible invalidation") return False - # 3. Copy data back to a ctypes buffer and push to the C++ backend + # 4. 
Copy data back to a ctypes buffer and push to the C++ backend buffer = (ctypes.c_uint8 * cp.size).from_buffer_copy(cp.data) ret = self._set_data_ext( self._ctx, buffer, cp.size, seq_id, flags @@ -549,7 +742,13 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: success = (ret == cp.size) if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): restore {'OK' if success else 'FAIL'} pos={cp.pos}") + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(restore_checkpoint): restore " + f"{'OK' if success else 'FAIL'} " + f"mode={mode}, seq_id={seq_id}, pos={cp.pos}", + file=sys.stderr, + ) return success # Disable BaseLlamaCache Dictionary Interfaces diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..6ffe68e5e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,7 +1,5 @@ from __future__ import annotations -import base64 -import ctypes import dataclasses import datetime import json @@ -9,9 +7,7 @@ import random import string import sys -import zlib -from contextlib import ExitStack from typing import ( Any, Dict, @@ -26,21 +22,17 @@ ) import jinja2 +from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment import numpy as np import numpy.typing as npt -import urllib.request -from urllib.error import URLError, HTTPError - -import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar -from ._ggml import GGMLLogLevel -from ._logger import logger, ggml_log_callback +from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -130,6 +122,17 @@ def __call__( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic 
first-reasoning-block budget control. These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -220,6 +223,46 @@ def __call__( class Jinja2ChatFormatter(ChatFormatter): + class IgnoreGenerationTags(Extension): + """Render HuggingFace `{% generation %}` blocks without tracking. + + HuggingFace chat templates may wrap assistant text with: + + {% generation %} + ... + {% endgeneration %} + + Transformers uses this tag to compute assistant-token masks. In + llama-cpp-python chat formatting we only need the final rendered prompt, + so this extension simply removes the tag pair and renders the inner + content as normal Jinja template content. + + This keeps compatibility with HF templates while avoiding the overhead + of span tracking. + + More information see: + https://github.com/huggingface/transformers/blob/39603d0e5cdb6f00e8d473d7fcbb01032d709181/src/transformers/utils/chat_template_utils.py#L425 + """ + + tags = {"generation"} + + def parse(self, parser: jinja2.parser.Parser): + # Consume the opening `{% generation %}` token. + lineno = next(parser.stream).lineno + + # Parse and return the block body until `{% endgeneration %}`. + # Returning the body directly makes the tag a transparent wrapper. + body = parser.parse_statements( + ("name:endgeneration",), + drop_needle=True, + ) + + # Preserve line numbers for better template error messages. 
+ for node in body: + node.set_lineno(lineno) + + return body + def __init__( self, template: str, @@ -227,21 +270,118 @@ def __init__( bos_token: str, add_generation_prompt: bool = True, stop_token_ids: Optional[List[int]] = None, + special_tokens_map: Optional[Dict[str, str]] = None, ): - """A chat formatter that uses jinja2 templates to format the prompt.""" + """Format chat messages with a HuggingFace-style Jinja2 chat template. + + Args: + template: + Raw HuggingFace chat template string. + eos_token: + Text form of the model EOS token. + bos_token: + Text form of the model BOS token. + add_generation_prompt: + Whether to ask the template to append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + stop_token_ids: + Optional token ids that should stop generation when they appear + as the last generated token. This is llama-cpp-python specific. + special_tokens_map: + Optional tokenizer special-token map. Some HF templates may + reference extra variables such as `pad_token`, `unk_token`, + `sep_token`, or model-specific special tokens. + """ self.template = template self.eos_token = eos_token self.bos_token = bos_token self.add_generation_prompt = add_generation_prompt + self.special_tokens_map = special_tokens_map or {} + self.stop_token_ids = ( - set(stop_token_ids) if stop_token_ids is not None else None + {int(token_id) for token_id in stop_token_ids} + if stop_token_ids is not None + else None ) - self._environment = ImmutableSandboxedEnvironment( + environment = ImmutableSandboxedEnvironment( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, - ).from_string(self.template) + # Keep this aligned with Transformers' chat-template Jinja setup: + # - IgnoreGenerationTags supports `{% generation %}` blocks. + # - loopcontrols supports `{% break %}` and `{% continue %}`. 
+ extensions=[ + Jinja2ChatFormatter.IgnoreGenerationTags, + jinja2.ext.loopcontrols, + ], + ) + + # Match Transformers' chat-template JSON behavior. + # Jinja's default `tojson` escapes HTML characters, which is not what + # plain-text chat templates usually expect. + environment.filters["tojson"] = self.tojson + + # Register these as globals once instead of passing them on every render. + environment.globals["raise_exception"] = self.raise_exception + environment.globals["strftime_now"] = self.strftime_now + + self._environment = environment + self._template = environment.from_string(self.template) + + # Precompute static stop fields once. This avoids rebuilding closures and + # StoppingCriteriaList objects for every chat completion request. + self._stop = [self.eos_token] if self.eos_token else [] + self._stopping_criteria = self._build_stopping_criteria() + + @staticmethod + def raise_exception(message: str): + """Raise a Jinja template error from inside a chat template.""" + raise jinja2.exceptions.TemplateError(message) + + @staticmethod + def strftime_now(format_string: str = "%Y-%m-%d %H:%M:%S") -> str: + """Return the current local time formatted with `datetime.strftime`.""" + return datetime.datetime.now().strftime(format_string) + + @staticmethod + def tojson( + x: Any, + ensure_ascii: bool = False, + indent: Optional[int] = None, + separators: Optional[Tuple[str, str]] = None, + sort_keys: bool = False, + ) -> str: + """Serialize an object to JSON for chat-template rendering. + + This intentionally bypasses Jinja's built-in `tojson` filter because + the built-in filter escapes HTML-sensitive characters. HuggingFace chat + templates expect plain JSON text instead. 
+ """ + return json.dumps( + x, + ensure_ascii=ensure_ascii, + indent=indent, + separators=separators, + sort_keys=sort_keys, + ) + + def _build_stopping_criteria(self): + """Create stopping criteria once during initialization.""" + if self.stop_token_ids is None: + return None + + stop_token_ids = self.stop_token_ids + + def stop_on_last_token( + tokens: npt.NDArray[np.intc], + logits: npt.NDArray[np.single], + ) -> bool: + # Defensive guard: generation normally calls this with at least one + # token, but the callback should never crash on empty input. + return len(tokens) > 0 and int(tokens[-1]) in stop_token_ids + + return llama_core.StoppingCriteriaList([stop_on_last_token]) def __call__( self, @@ -251,44 +391,106 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + documents: Optional[List[Dict[str, Any]]] = None, **kwargs: Any, ) -> ChatFormatterResponse: - def raise_exception(message: str): - raise ValueError(message) + """Render OpenAI-style chat messages into a model prompt. - def strftime_now(format_string="%Y-%m-%d %H:%M:%S") -> str: - """ - Returns the current time formatted as a string. - """ - return datetime.datetime.now().strftime(format_string) + The method builds the variable context expected by HuggingFace-style + Jinja chat templates and renders the final prompt string used by + llama-cpp-python. - prompt = self._environment.render( - messages=messages, - eos_token=self.eos_token, - bos_token=self.bos_token, - raise_exception=raise_exception, - strftime_now=strftime_now, - add_generation_prompt=self.add_generation_prompt, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - ) + Template variables provided by default: + messages: + The chat history to render. 
Each item is expected to be an + OpenAI-style message dictionary, usually containing at least + `role` and `content`. - stopping_criteria = None - if self.stop_token_ids is not None: + eos_token: + The model's end-of-sequence token string. + + bos_token: + The model's beginning-of-sequence token string. + + add_generation_prompt: + Whether the template should append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + + functions: + Legacy OpenAI-compatible function definitions, if provided. - def stop_on_last_token( - tokens: npt.NDArray[np.intc], logits: npt.NDArray[np.single] - ) -> bool: - return tokens[-1] in self.stop_token_ids + function_call: + Legacy OpenAI-compatible function-call selection, if provided. - stopping_criteria = llama_core.StoppingCriteriaList([stop_on_last_token]) + tools: + OpenAI/HuggingFace-compatible tool definitions, if provided. + This formatter expects tools to already be normalized into + JSON-schema-like dictionaries. It does not auto-convert Python + callables into JSON schemas like Transformers can. + + tool_choice: + Optional tool-choice instruction, such as `"auto"`, `"none"`, + or a specific tool/function selection object. + + documents: + Optional RAG/document context. Some HF chat templates reference + this variable when rendering retrieval-augmented prompts. + + **kwargs: + Extra model-specific or template-specific variables. These are + merged into the template context last, so they can intentionally + override the defaults above when needed. + + Additional variables: + Values from `special_tokens_map` are also exposed to the template, + such as `pad_token`, `unk_token`, `sep_token`, or custom + model-specific special tokens. Core variables like `messages`, + `eos_token`, and `bos_token` override `special_tokens_map` entries + by default. 
+ + Returns: + ChatFormatterResponse: + Contains the rendered prompt, text stop sequences, optional + token-id stopping criteria, and `added_special=True` because the + chat template is responsible for adding model special tokens. + + Raises: + jinja2.exceptions.TemplateError: + If the template calls `raise_exception(...)` or Jinja rendering + fails. + """ + template_kwargs: Dict[str, Any] = {} + + # Make extra tokenizer special tokens available to templates, e.g. + # `pad_token`, `unk_token`, `sep_token`, or model-specific tokens. + template_kwargs.update(self.special_tokens_map) + + # Explicit core variables should override values from special_tokens_map. + template_kwargs.update( + { + "messages": messages, + "eos_token": self.eos_token, + "bos_token": self.bos_token, + "add_generation_prompt": self.add_generation_prompt, + "functions": functions, + "function_call": function_call, + "tools": tools, + "tool_choice": tool_choice, + "documents": documents, + } + ) + + # Let caller-provided kwargs extend the template context. + # If a caller intentionally passes a same-name key, it will override the + # defaults above. This is useful for model-specific template variables. + template_kwargs.update(kwargs) + + prompt = self._template.render(**template_kwargs) return ChatFormatterResponse( prompt=prompt, - stop=[self.eos_token], - stopping_criteria=stopping_criteria, + stop=self._stop, + stopping_criteria=self._stopping_criteria, added_special=True, ) @@ -629,6 +831,17 @@ def chat_completion_handler( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. 
+ reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -764,6 +977,12 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if tool is not None: tool_name = tool["function"]["name"] @@ -2809,3112 +3028,188 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class MTMDChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"""You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, -while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful.""" - ) - - CHAT_FORMAT = ( +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + 
stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% elif content.type == 'audio_url' %}" - "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" - "{% elif content.type == 'input_audio' %}" - "{% if content.input_audio is string %}" - "{{ content.input_audio }}" - "{% else %}" - "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" - "{% endif %}" - "{% elif content.type == 'text' %}" - "{{ content.text }}" - "{% endif 
%}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." + "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" "{% endfor %}" - - "{% if add_generation_prompt %}" - "\nASSISTANT: " + "<|im_end|>\n" "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) - def __init__( - 
self, - clip_model_path: str, - verbose: bool = True, - use_gpu: bool = True, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - - self.log_prefix = self.__class__.__name__ - if kwargs: - unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) - raise TypeError( - f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" - f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." - ) - - self.clip_model_path = clip_model_path - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - self.use_gpu = use_gpu - self.verbose = verbose + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] - import llama_cpp.mtmd_cpp as mtmd_cpp - self._mtmd_cpp = mtmd_cpp - self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None - self.extra_template_arguments: dict[str, Any] = {} + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } - if not os.path.exists(clip_model_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + ) - # Pre-compile Jinja template - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) - - self._exit_stack = ExitStack() - - def _init_mtmd_context(self, llama_model: llama_core.Llama): - """Initialize mtmd context with the llama model.""" - if 
self.mtmd_ctx is not None: - return # Already initialized - - self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), - llama_model.model, - self.mctx_params + # Case 1: No tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + add_generation_prompt=True, ) - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") - - # Check if vision is supported - self.is_support_vision = 
self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) - - def close(self) -> None: - """Explicitly free the mtmd context and vision model resources.""" - if getattr(self, "mtmd_ctx", None) is not None: - try: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - except Exception: - pass - self.mtmd_ctx = None - self.mctx_params = None - self.chat_template = None - - if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): - self._exit_stack.close() - self._exit_stack = None - - def __del__(self) -> None: - self.close() + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) - def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: - """ - Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. - Strictly enforces capability checks, raising exceptions if unsupported media is passed. - - Returns: - media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). - """ - media_items: List[Dict[str, str]] = [] - for message in messages: - if isinstance(message.get("content"), list): - for content in message["content"]: - content_type = content.get("type", "") - - # 1. 
Vision Processing - if content_type == "image_url": - if not self.is_support_vision: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") - - url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] - media_items.append({"url": url, "type": "image"}) - - # 2. Audio Processing - elif content_type in ["audio_url", "input_audio"]: - if not self.is_support_audio: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") - - # Case A: Handle custom/forward-compatible audio_url format - if content == "audio_url": - url = content["audio_url"] if isinstance(content["audio_url"], str) else content["audio_url"]["url"] - media_items.append({"url": url, "type": "audio"}) - # Case B: Handle OpenAI standard input_audio format - else: - input_audio = content.get("input_audio", {}) - if isinstance(input_audio, dict) and "data" in input_audio: - # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic - # input_audio: { - # data: audio.base64Data, - # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' - # } - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - - # Strictly align with llama.cpp (require wav/mp3) - if audio_format not in ["wav", "mp3"]: - raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") - - # Format as a Data URI to reuse the unified load_media logic - media_items.append({ - "url": f"data:audio/{audio_format};base64,{audio_data}", - "type": "audio" - }) - else: - # Just a raw base64 data - url = input_audio if isinstance(input_audio, str) else "" - if url: - media_items.append({"url": url, "type": "audio"}) - - # 3. 
Text & Unknown Types - elif content_type == "text": - continue - else: - if self.verbose: - print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) - return media_items - - def _create_bitmap_from_bytes(self, media_bytes: bytes): - """ - Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. - - Supported formats: - - Images (via stb_image): jpg, png, bmp, etc. - - Audio (via miniaudio): wav, mp3, flac. - - Note: - - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. - - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. - - Args: - media_bytes (bytes): The raw byte content of the media file. - - Returns: - mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. - """ - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), - len(media_bytes) - ) - - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") - - return bitmap - - - def _process_mtmd_prompt( - self, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - add_generation_prompt: bool = True, - ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - """ - Core multimodal preprocessing pipeline. 
- Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. - - Features: - - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. - - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. - - Strict RAII-style C++ memory management to prevent leaks on failure. - - Returns: - full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. - chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). - chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). - bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. - """ - # 1. Inject default system prompt if omitted by the user - system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - - media_items = self._get_media_items(messages) - media_marker = self.media_marker - - # 2. Render the chat template and replace actual URLs with C++ media markers - text = self.chat_template.render( - messages=messages, - add_generation_prompt=add_generation_prompt, - eos_token=self.mtmd_eos_token, - bos_token=self.mtmd_bos_token, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - **getattr(self, 'extra_template_arguments', {}) - ) - # Replace image_url by media_marker in text - for item in media_items: - text = text.replace(item["url"], media_marker) - - if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) - - # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(media_items) - bitmap_cleanup = [] - chunks = None - - try: - # Concurrent Media Decoding - import concurrent.futures - if media_items: - def _create_bitmap_func(idx: int, item: str): - media_bytes = self.load_media(item["url"], item["type"]) - bitmap = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap - # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, - # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(media_items)) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] - - for future in concurrent.futures.as_completed(futures): - idx, bitmap = future.result() - bitmaps[idx] = bitmap - bitmap_cleanup.append(bitmap) - - # Strict validation: Abort if any thread failed to decode its assigned media - if any(b is None for b in bitmaps): - raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") - else: - if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") - else: - # If there are no images, set the bitmaps to empty. - bitmaps = [] - - # 4. Initialize mtmd_input_chunks - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True - - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") - - # 5. 
Hybrid Tokenization (Text + Media binding) - if len(bitmaps) > 0: - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) - ) - else: - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") - - # 6. Virtual Token Ledger Construction - full_prompt_ids = [] - chunk_token_spans = [] - current_idx = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - - # Cursor to track the actual media contents (URLs or base64 data) provided by the user - media_items_count = len(media_items) - media_items_cur = 0 - - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: - # Extract standard text token IDs - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) - if tokens_ptr and n_tokens_out.value > 0: - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) - full_prompt_ids.extend(tokens) - current_idx += len(tokens) - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: - # Extract media properties - # Note(JamePeng): - # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). - # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. 
- # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - - if media_items_cur < media_items_count: - # The C++ parser only sees identical placeholders (e.g., "<__media__>"). - # We MUST inject the actual media content's identity here. - real_media_url = media_items[media_items_cur]["url"] - # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Generate a deterministic, unique negative ID for this specific image/audio. - # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). - # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with - # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). - # This empowers `longest_token_prefix` to correctly identify and reuse cached images, - # while instantly breaking the match if the image content changes. - # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 - media_items_cur += 1 - else: - # Magic Negative Number as fallback :) - media_id = -314159 - - if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") - - chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) - - # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache - full_prompt_ids.extend([media_id] * chunk_n_tokens) - current_idx += chunk_n_tokens - else: - raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") - - return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup - - except Exception as e: - # Ensure no useless pointers remain upon any failure - # Free chunks - if chunks is not None: - 
self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Free bitmaps - if len(bitmap_cleanup) > 0: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup = None - bitmaps = None - - raise e - - def __call__( - self, - *, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - add_generation_prompt: bool = True, - **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: 
- # 1. Initialize mtmd context - self._init_mtmd_context(llama) - assert self.mtmd_ctx is not None - - # 2. Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( - llama=llama, - messages=messages, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - add_generation_prompt=add_generation_prompt, - ) - - if self.verbose: - print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - - try: - # 3. KV Cache Synchronization & State Rollback - # Compares the virtual ledger with physical history to prevent Cache Poisoning. - current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) - - if longest_prefix < llama.n_tokens: - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if llama._hybrid_cache_mgr.max_checkpoints > 0: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos - if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. 
Clearing cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) - llama._ctx.memory_seq_rm(0, longest_prefix, -1) - llama.n_tokens = longest_prefix - - n_past = llama.n_tokens - - for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: - # Skip previously matched chunks - if end_idx <= n_past: - continue - - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: - unprocessed_start = max(start_idx, n_past) - start_idx - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - - if tokens_ptr and n_tokens_out.value > 0: - all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - tokens_to_eval = all_tokens[unprocessed_start:] - - if tokens_to_eval: - if self.verbose: - print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - # Text evaluation delegates shift and chunking to native llama.eval - llama.eval(tokens_to_eval) - n_past = llama.n_tokens - - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) - - if self.verbose: - media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" - print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - - # Stage 5: Multimodal Physical OOM Defense - if n_past + chunk_n_tokens > llama.n_ctx(): - if llama._ctx.memory_can_shift(): - raise RuntimeError( - 
f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " - f"(n_pos_per_embd > 1 or incompatible M-RoPE). " - f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " - f"You MUST increase n_ctx to fit the dialogue." - ) - else: - # Safely discard oldest tokens while preserving system prompts - n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch - n_keep = min(llama.n_keep, n_past) - n_discard = min(n_discard, n_past - n_keep) - - if n_discard <= 0: - raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - - if self.verbose: - print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - - # Execute physical memory shift - llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) - llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - - # Shift python virtual array to match - remaining_len = n_past - (n_keep + n_discard) - if remaining_len > 0: - llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - - n_past -= n_discard - llama.n_tokens = n_past - - # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp_lib.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk_ptr, - llama_cpp_lib.llama_pos(n_past), - llama_cpp_lib.llama_seq_id(0), - llama.n_batch, - True, # logits_last = True, drastically saves computational overhead - ctypes.byref(new_n_past) - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") - - # Update Ledger with "Negative Reverse Vocabulary" IDs - llama.input_ids[n_past : new_n_past.value] = media_id - n_past = new_n_past.value - llama.n_tokens = n_past - - # Extract the final, perfectly synchronized 
prompt sequence - prompt = llama.input_ids[: llama.n_tokens].tolist() - - # End-of-Turn Checkpoint - # Anchors the state ONLY after the entire multi-modal turn is processed - if ( - llama.is_hybrid - and llama._hybrid_cache_mgr is not None - and llama._hybrid_cache_mgr.max_checkpoints > 0 - ): - if self.verbose: - print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) - - llama._hybrid_cache_mgr.save_checkpoint( - current_pos=llama.n_tokens, - tokens=prompt, - seq_id=0 - ) - finally: - # Cleanup chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Cleanup bitmaps - if bitmap_cleanup: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup.clear() - bitmap_array = None - - # Handle response format and tools (same as before) - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - tool = None - if ( - tool_choice is not None - and isinstance(tool_choice, dict) - and tools is not None - ): - name = tool_choice["function"]["name"] - tool = next((t for t in tools if t["function"]["name"] == name), None) - if tool is None: - raise ValueError(f"Tool choice '{name}' not found in tools.") - schema = tool["function"]["parameters"] - try: - # create grammar from json schema - grammar = 
llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose - ) - except Exception as e: - if llama.verbose: - print(str(e), file=sys.stderr) - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - logprobs=top_logprobs if logprobs else None, - stream=stream, - stop=stop, - seed=seed, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logit_bias=logit_bias, - ) - - if tool is not None: - tool_name = tool["function"]["name"] - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream - ) - return _convert_completion_to_chat(completion_or_chunks, stream=stream) - - def load_media(self, media_url: str, media_type: str) -> bytes: - """ - Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image or audio processor based on the media_type. 
@staticmethod
def detect_audio_format(audio_bytes: bytes) -> str:
    """
    Identify the audio container of a payload by inspecting its leading magic bytes.

    The checks follow the original author's note that this mirrors llama.cpp's
    `is_audio_file`: a RIFF header only counts as WAV when the form type at
    offset 8 is "WAVE", so other RIFF containers (e.g. AVI) are rejected.

    Args:
        audio_bytes: Raw bytes of the audio file (at least 12 bytes).

    Returns:
        One of "wav", "mp3" or "flac".

    Raises:
        ValueError: If the payload is shorter than 12 bytes or matches none of
            the three recognised signatures.
    """
    # 12 bytes is the minimum needed to read the RIFF form type at [8:12].
    if len(audio_bytes) < 12:
        raise ValueError("Audio data is corrupted or too small (less than 12 bytes).")

    # WAV: "RIFF" <4-byte size> "WAVE"
    if audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE":
        return "wav"

    # MP3 with a leading ID3 metadata tag
    if audio_bytes.startswith(b"ID3"):
        return "mp3"

    # MP3 starting directly on an MPEG frame: 0xFF followed by a byte whose
    # top three bits are set (the frame sync word).
    if audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0:
        return "mp3"

    # FLAC stream marker
    if audio_bytes.startswith(b"fLaC"):
        return "flac"

    raise ValueError(
        "Unsupported audio format detected via magic bytes. "
        "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
    )
Handle local file path - elif os.path.exists(audio_url): - with open(audio_url, "rb") as f: - audio_bytes = f.read() - - # 3. Handle remote URL via HTTP/HTTPS - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(audio_url, headers=headers) - try: - with urllib.request.urlopen(req, timeout=15) as f: - audio_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") - - if not audio_bytes: - raise ValueError("Empty audio data received") - - return audio_bytes - - @staticmethod - def _load_image(image_url: str) -> bytes: - """ - Load an image from either a URL or a data URI and return it as JPEG bytes. - - Supports: - - Remote images via HTTP/HTTPS (with proper User-Agent) - - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - - Returns: - JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. - """ - image_bytes = b"" - - # 1. Handle data URI (base64) - if image_url.strip().startswith("data:"): - # Split only once from the right to correctly handle mime types containing commas - comma_pos = image_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = image_url[comma_pos + 1 :] - image_bytes = base64.b64decode(base64_data) - - # 2. Handle local/remote URL - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(image_url, headers=headers) - - try: - with urllib.request.urlopen(req, timeout=15) as f: - image_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download image from {image_url}: {e}") - - if not image_bytes: - raise ValueError("Empty image data received") - - # 3. 
Open image with Pillow - try: - from PIL import Image, ImageStat - except ImportError: - raise ImportError("Pillow is required for image processing. Install with: pip install pillow") - - import io - image = Image.open(io.BytesIO(image_bytes)) - - # 4. Handle transparency (RGBA, LA, P with transparency, etc.) - if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): - # Use alpha channel as mask - if image.mode == "P": - image = image.convert("RGBA") - - alpha = image.split()[-1] # Last channel is alpha - # Compute average brightness of visible (non-transparent) pixels - stat = ImageStat.Stat(image.convert("L"), mask=alpha) - - # Choose background: white for dark content, black for bright content - bg_color = (255, 255, 255) # white - if stat.count[0] > 0 and stat.mean[0] > 127: - bg_color = (0, 0, 0) # black - - background = Image.new("RGB", image.size, bg_color) - background.paste(image, mask=alpha) - image = background - - # 5. Ensure RGB mode for formats like CMYK, palette, etc. - elif image.mode != "RGB": - image = image.convert("RGB") - - # 6. Save as high-quality JPEG, suitable for most vision models. - output = io.BytesIO() - image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) - return output.getvalue() - - @classmethod - def from_pretrained( - cls, - repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, - **kwargs: Any, - ) -> "MTMDChatHandler": - import fnmatch - from pathlib import Path - - try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore - from huggingface_hub.utils import validate_repo_id # type: ignore - except ImportError: - raise ImportError( - "Llama.from_pretrained requires the huggingface_hub package. " - "You can install it with `pip install --upgrade huggingface_hub`." 
class Llava15ChatHandler(MTMDChatHandler):
    """
    Chat handler whose template produces a plain "USER: ... ASSISTANT: ..." transcript.

    The class only overrides CHAT_FORMAT; all loading and evaluation logic is
    inherited from MTMDChatHandler.
    """

    # Template text assembled from adjacent string literals.
    # NOTE(review): presumably rendered by the base handler as a Jinja2 template
    # with `messages` and `add_generation_prompt` in scope — confirm in MTMDChatHandler.
    CHAT_FORMAT = (
        "{% for message in messages %}"
        # System message: content is emitted verbatim, with no role label.
        "{% if message.role == 'system' %}"
        "{{ message.content }}"
        "{% endif %}"

        # User message: prefixed with "\nUSER: ".
        "{% if message.role == 'user' %}"
        "{% if message.content is string %}"
        "\nUSER: {{ message.content }}"
        "{% elif message.content is iterable %}"
        "\nUSER: "
        # First pass over the parts: every image URL (plain string or {url: ...} mapping).
        "{% for content in message.content %}"
        "{% if content.type == 'image_url' %}"
        "{{ content.image_url if content.image_url is string else content.image_url.url }}"
        "{% endif %}"
        "{% endfor %}"
        # Second pass: every text part, so all images precede all text in the turn.
        "{% for content in message.content %}"
        "{% if content.type == 'text' %}"
        "{{ content.text }}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "{% endif %}"

        # Assistant message: skipped entirely when its content is none.
        "{% if message.role == 'assistant' and message.content is not none %}"
        "\nASSISTANT: {{ message.content }}"
        "{% endif %}"
        "{% endfor %}"

        # Generation prompt: open an assistant turn for the model to complete.
        "{% if add_generation_prompt %}"
        "\nASSISTANT: "
        "{% endif %}"
    )
class Llava16ChatHandler(MTMDChatHandler):
    """
    Chat handler that only overrides CHAT_FORMAT; everything else is inherited
    from MTMDChatHandler.
    """

    # Example prompt
    # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:"
    #
    # NOTE(review): the template below never emits "USER:" or "ASSISTANT:" labels
    # and its generation prompt is "Answer:", so it does not produce the example
    # above. Confirm which of the two is the intended format.

    CHAT_FORMAT = (
        "{% for message in messages %}"
        # System message: content emitted verbatim.
        "{% if message.role == 'system' %}"
        "{{ message.content }}"
        "{% endif %}"
        "{% if message.role == 'user' %}"
        # NOTE(review): Jinja's `iterable` test is also true for plain strings;
        # this branch presumably emits nothing for them because characters have
        # no `.type` — depends on the environment's undefined handling, confirm.
        "{% if message.content is iterable %}"
        # Image parts: each URL (string or {url: ...} mapping) followed by a newline.
        "{% for content in message.content %}"
        "{% if content.type == 'image_url' %}"
        "{% if content.image_url is string %}"
        "{{ content.image_url }}\n"
        "{% endif %}"
        "{% if content.image_url is mapping %}"
        "{{ content.image_url.url }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        # Text parts of a multi-part message, emitted after all images.
        "{% for content in message.content %}"
        "{% if content.type == 'text' %}"
        "{{ content.text }}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        # Plain-string user message.
        "{% if message.content is string %}"
        "{{ message.content }}"
        "{% endif %}"
        "{% endif %}"
        # Assistant message: content emitted verbatim, with no role label.
        "{% if message.role == 'assistant' %}"
        "{{ message.content }}"
        "{% endif %}"
        "{% endfor %}"
        # Generation prompt
        "{% if add_generation_prompt %}"
        "Answer:"
        "{% endif %}"
    )
"{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "<|im_end|>" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class Llama3VisionAlphaChatHandler(MTMDChatHandler): - # question = "" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "<|start_header_id|>" - "{% if message.role == 'user' %}" - "user<|end_header_id|>\n\n" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "assistant<|end_header_id|>\n\n" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "{% endif %}" - ) - - -# alias -Llama3VisionAlpha = Llama3VisionAlphaChatHandler - - -class MiniCPMv26ChatHandler(MTMDChatHandler): - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and messages[0]['role'] 
!= 'system' %}" - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is iterable %}" - "{% for content in message['content'] %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - - "{% for content in message['content'] %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% endif %}" - "<|im_end|>\n" - "{% endfor %}" - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MiniCPMv45ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V 4.5 models. - - Supports: - - Multi-step tool calls with and XML tags. - - Integrated reasoning (thinking) process with tags. - - Specialized system prompt handling with tool definitions. - - Global image numbering for multi-image processing. - """ - - # Model specific control tokens - MINICPMV_BOS_TOKEN = "<|im_start|>" - MINICPMV_EOS_TOKEN = "<|im_end|>" - MINICPMV_PAD_TOKEN = "<|endoftext|>" - - # Image placeholder tags - MINICPMV_IMAGE_START_TOKEN = "" - MINICPMV_IMAGE_END_TOKEN = "" - MINICPMV_IMAGE_ID_START_TOKEN = "" - MINICPMV_IMAGE_ID_END_TOKEN = "" - - CHAT_FORMAT = ( - # --- 1. 
First System Message & Tools Definitions --- - "{%- if tools %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" - "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" - "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" - "{{- 'You are provided with function signatures within XML tags:\\n' }}" - "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" - "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- elif messages[0].role == 'system' %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - - # --- 2. Message Stream Processing --- - "{% set image_count = namespace(value=0) %}" - "{%- for message in messages %}" - # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- - "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" - "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" - - "{%- set content = message.content %}" - "{%- if content is not string %}" - "{%- set ns = namespace(content_str='') %}" - "{%- for item in content %}" - # --- Explicit image_url type and value checking --- - "{%- if item.type == 'image_url' %}" - "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" - "{%- set image_count.value = image_count.value + 1 %}" - # Format: N: IMAGE_URL - "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" - "{%- elif item.type == 'text' %}" - "{%- set ns.content_str = ns.content_str + item.text %}" - "{%- endif %}" - "{%- endfor %}" - "{%- set content = ns.content_str %}" - "{%- endif %}" - - "{{- content -}}" - - # Append tool_calls to assistant messages if they exist - "{%- if message.role == 
'assistant' and message.tool_calls %}" - "{%- for tool_call in message.tool_calls %}" - "{%- set tc = tool_call.function if tool_call.function else tool_call %}" - "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" - "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" - "{{- '}\\n' }}" - "{%- endfor %}" - "{%- endif %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - - # --- Specialized Tool Response Handling --- - # Group consecutive tool responses under a single user-like block - "{%- elif message.role == 'tool' %}" - "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" - "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" - "{%- endif %}" - "{{- '\\n\\n' + message.content + '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - - # --- 3. Generation Prompt --- - "{%- if add_generation_prompt %}" - "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" - # Handle thinking/reasoning block visibility based on configuration - "{%- if enable_thinking is defined and enable_thinking is false %}" - "{{- '\\n\\n\\n\\n' }}" - "{%- elif enable_thinking is defined and enable_thinking is true %}" - "{{- '\\n' }}" - "{%- endif %}" - "{%- endif %}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V 4.5 Handler. - - Args: - enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base MTMDChatHandler. 
def __call__(self, **kwargs):
    """
    Run a chat completion with MiniCPM-V 4.5 specific template arguments and stop tokens.

    Args:
        **kwargs: Keyword arguments forwarded to the base handler's __call__.
            Must contain 'llama'. An optional 'stop' (str, list of str, or None)
            is honoured in addition to the model's own end tokens.

    Returns:
        Whatever the base handler's __call__ returns.

    Raises:
        KeyError: If 'llama' is not supplied.
    """
    # Inject thinking control flag into the template
    self.extra_template_arguments["enable_thinking"] = self.enable_thinking

    # Always stop on the model's EOS/PAD tokens, but keep any stop sequences the
    # caller asked for instead of silently discarding them.
    required_stops = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN]
    caller_stops = kwargs.get('stop')
    if caller_stops is None:
        caller_stops = []
    elif isinstance(caller_stops, str):
        caller_stops = [caller_stops]
    kwargs['stop'] = required_stops + [
        s for s in caller_stops if s and s not in required_stops
    ]

    llama = kwargs['llama']

    # Zero the token ledger before delegating.
    # NOTE(review): presumably forces the base handler to re-evaluate the whole
    # prompt rather than match a previously cached prefix — confirm intent.
    if hasattr(llama, 'input_ids'):
        llama.input_ids.fill(0)

    if self.verbose:
        # Diagnostics go to stderr, like the other verbose messages in this file,
        # so they never mix with program output on stdout.
        print(
            f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing",
            file=sys.stderr,
        )
    return super().__call__(**kwargs)
item['type'] == 'text' %}" - "{{ item['text'] | trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "model\n" - "{% endif %}" - ) - - -class Gemma4ChatHandler(MTMDChatHandler): - """ - Handler for Gemma 4 models. - - Note on `enable_thinking`: - The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. - It is NOT supported by Gemma4 E2B and E4B models. - - [Important Note for Audio Processing!] - It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. - Other quantizations are known to have degraded performance; - ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 - """ - - # The special token in Gemma 4 - GEMMA4_BOI_TOKEN = "<|image>" - GEMMA4_EOI_TOKEN = "" - GEMMA4_BOA_TOKEN = "<|audio>" - GEMMA4_EOA_TOKEN = "" - GEMMA4_BOS_TOKEN = "" - GEMMA4_EOS_TOKEN = "" - GEMMA4_SOT_TOKEN = "<|turn>" - GEMMA4_EOT_TOKEN = "" - GEMMA4_SOC_TOKEN = "<|channel>" - GEMMA4_EOC_TOKEN = "" - GEMMA4_STC_TOKEN = "<|tool_call>" - GEMMA4_ETC_TOKEN = "" - GEMMA4_STD_TOKEN = "<|tool>" - GEMMA4_ETD_TOKEN = "" - GEMMA4_STR_TOKEN = "<|tool_response>" - GEMMA4_ETR_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required) -%}\n" - " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in properties | dictsort -%}\n" - " {%- set add_comma = false -%}\n" - " {%- if key not in standard_keys -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {{ key }}:{\n" - " {%- if value['description'] -%}\n" - " description:<|\"|>{{ value['description'] }}<|\"|>\n" - " {%- set add_comma = true -%}\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'STRING' -%}\n" - " {%- if value['enum'] -%}\n" - " {%- if 
add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " enum:{{ format_argument(value['enum']) }}\n" - " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'ARRAY' -%}\n" - " {%- if value['items'] is mapping and value['items'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " items:{\n" - " {%- set ns_items = namespace(found_first=false) -%}\n" - " {%- for item_key, item_value in value['items'] | dictsort -%}\n" - " {%- if item_value is not none -%}\n" - " {%- if ns_items.found_first %},{% endif -%}\n" - " {%- set ns_items.found_first = true -%}\n" - " {%- if item_key == 'properties' -%}\n" - " properties:{\n" - " {%- if item_value is mapping -%}\n" - " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- elif item_key == 'required' -%}\n" - " required:[\n" - " {%- for req_item in item_value -%}\n" - " <|\"|>{{- req_item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- elif item_key == 'type' -%}\n" - " {%- if item_value is string -%}\n" - " type:{{ format_argument(item_value | upper) }}\n" - " {%- else -%}\n" - " type:{{ format_argument(item_value | map('upper') | list) }}\n" - " {%- endif -%}\n" - " {%- else -%}\n" - " {{ item_key }}:{{ format_argument(item_value) }}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " }\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'OBJECT' -%}\n" - " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " }\n" - " {%- elif value 
is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value, value['required'] | default([])) -}}\n" - " }\n" - " {%- endif -%}\n" - " {%- if value['required'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - "{%- endmacro -%}\n" - "{%- macro format_function_declaration(tool_data) -%}\n" - " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" - " {%- set params = tool_data['function']['parameters'] -%}\n" - " {%- if params -%}\n" - " ,parameters:{\n" - " {%- if params['properties'] -%}\n" - " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" - " {%- endif -%}\n" - " {%- if params['required'] -%}\n" - " required:[\n" - " {%- for item in params['required'] -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {{- ',' if not loop.last -}}\n" - " {%- endfor -%}\n" - " ],\n" - " {%- endif -%}\n" - " {%- if params['type'] -%}\n" - " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if 'response' in tool_data['function'] -%}\n" - " {%- set response_declaration = tool_data['function']['response'] -%}\n" - " ,response:{\n" - " {%- if response_declaration['description'] -%}\n" - " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" - " {%- endif -%}\n" - " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" - " type:<|\"|>{{- response_declaration['type'] | upper 
-}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " }\n" - "{%- endmacro -%}\n" - "{%- macro format_argument(argument, escape_keys=True) -%}\n" - " {%- if argument is string -%}\n" - " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" - " {%- elif argument is boolean -%}\n" - " {{- 'true' if argument else 'false' -}}\n" - " {%- elif argument is mapping -%}\n" - " {{- '{' -}}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in argument | dictsort -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {%- if escape_keys -%}\n" - " {{- '<|\"|>' + key + '<|\"|>' -}}\n" - " {%- else -%}\n" - " {{- key -}}\n" - " {%- endif -%}\n" - " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- elif argument is sequence -%}\n" - " {{- '[' -}}\n" - " {%- for item in argument -%}\n" - " {{- format_argument(item, escape_keys=escape_keys) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- ']' -}}\n" - " {%- else -%}\n" - " {{- argument -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- macro strip_thinking(text) -%}\n" - " {%- set ns = namespace(result='') -%}\n" - " {%- for part in text.split('') -%}\n" - " {%- if '<|channel>' in part -%}\n" - " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" - " {%- else -%}\n" - " {%- set ns.result = ns.result + part -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.result | trim -}}\n" - "{%- endmacro -%}\n" - "{%- macro format_tool_response_block(tool_name, response) -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if response is mapping -%}\n" - " {{- 'response:' + tool_name + '{' -}}\n" - " {%- for key, value in response | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- else -%}\n" - " {{- 'response:' + tool_name + '{value:' 
+ format_argument(response, escape_keys=False) + '}' -}}\n" - " {%- endif -%}\n" - " {{- '' -}}\n" - "{%- endmacro -%}\n" - "{%- set ns = namespace(prev_message_type=None) -%}\n" - "{%- set loop_messages = messages -%}\n" - "{{- bos_token -}}\n" - "{#- Handle System/Tool Definitions Block -#}\n" - "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- '<|turn>system\\n' -}}\n" - " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" - " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>\\n' -}}\n" - " {%- set ns.prev_message_type = 'think' -%}\n" - " {%- endif -%}\n" - " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- messages[0]['content'] | trim -}}\n" - " {%- set loop_messages = messages[1:] -%}\n" - " {%- endif -%}\n" - " {%- if tools -%}\n" - " {%- for tool in tools %}\n" - " {{- '<|tool>' -}}\n" - " {{- format_function_declaration(tool) | trim -}}\n" - " {{- '' -}}\n" - " {%- endfor %}\n" - " {%- set ns.prev_message_type = 'tool' -%}\n" - " {%- endif -%}\n" - " {{- '\\n' -}}\n" - "{%- endif %}\n" - "{#- Pre-scan: find last user message index for reasoning guard -#}\n" - "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" - "{%- for i in range(loop_messages | length) -%}\n" - " {%- if loop_messages[i]['role'] == 'user' -%}\n" - " {%- set ns_turn.last_user_idx = i -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{#- Loop through messages -#}\n" - "{%- for message in loop_messages -%}\n" - " {%- if message['role'] != 'tool' -%}\n" - " {%- set ns.prev_message_type = None -%}\n" - " {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n" - " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" - " {%- set prev_nt = namespace(role=None, found=false) -%}\n" - " {%- if loop.index0 > 0 -%}\n" - " {%- for j in range(loop.index0 - 
1, -1, -1) -%}\n" - " {%- if not prev_nt.found -%}\n" - " {%- if loop_messages[j]['role'] != 'tool' -%}\n" - " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" - " {%- set prev_nt.found = true -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" - " {%- if not continue_same_model_turn -%}\n" - " {{- '<|turn>' + role + '\\n' }}\n" - " {%- endif -%}\n" - " {#- Render reasoning/reasoning_content as thinking channel -#}\n" - " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" - " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" - " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" - " {%- endif -%}\n" - " {%- if message['tool_calls'] -%}\n" - " {%- for tool_call in message['tool_calls'] -%}\n" - " {%- set function = tool_call['function'] -%}\n" - " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" - " {%- if function['arguments'] is mapping -%}\n" - " {%- set ns_args = namespace(found_first=false) -%}\n" - " {%- for key, value in function['arguments'] | dictsort -%}\n" - " {%- if ns_args.found_first %},{% endif -%}\n" - " {%- set ns_args.found_first = true -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- endfor -%}\n" - " {%- elif function['arguments'] is string -%}\n" - " {{- function['arguments'] -}}\n" - " {%- endif -%}\n" - " {{- '}' -}}\n" - " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_call' -%}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out = namespace(flag=false) -%}\n" - " {%- if message.get('tool_responses') -%}\n" - " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" - " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- format_tool_response_block(tool_response['name'] | default('unknown'), 
tool_response['response']) -}}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endfor -%}\n" - " {%- elif message.get('tool_calls') -%}\n" - " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" - " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" - " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" - " {%- if ns_tool_scan.stopped -%}\n" - " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" - " {%- set ns_tool_scan.stopped = true -%}\n" - " {%- else -%}\n" - " {%- set follow = loop_messages[k] -%}\n" - " {#- Resolve tool_call_id to function name -#}\n" - " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" - " {%- for tc in message['tool_calls'] -%}\n" - " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" - " {%- set ns_tname.name = tc['function']['name'] -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {#- Handle content as string or content-parts array -#}\n" - " {%- set tool_body = follow.get('content') -%}\n" - " {%- if tool_body is string -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- elif tool_body is sequence and tool_body is not string -%}\n" - " {%- set ns_txt = namespace(s='') -%}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'text' -%}\n" - " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" - " {%- else -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- if message['content'] is string -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(message['content']) -}}\n" - " {%- else -%}\n" - " 
{{- message['content'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif message['content'] is sequence -%}\n" - " {%- for item in message['content'] -%}\n" - " {%- if item['type'] == 'text' -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(item['text']) -}}\n" - " {%- else -%}\n" - " {{- item['text'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif item['type'] == 'image_url' -%}\n" - " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- set ns.prev_message_type = 'image' -%}\n" - " {%- elif item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - # " {%- elif item['type'] == 'video_url' -%}\n" - # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not message.get('content')) -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" - " {{- '<|turn>model\\n' -}}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- 
'<|channel>thought\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Gemma 4 Handler. - - Args: - enable_thinking (bool): Controls whether the <|think|> tag is injected and - manages <|channel>thought behavior. - Note: ONLY supported on Gemma4 31B and 26BA4B models. - NOT supported on Gemma4 E2B and E4B models. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [ 1, 106, 50] - kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GLM41VChatHandler(MTMDChatHandler): - # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. 
- - GLM41V_EOS_TOKEN = "<|endoftext|>" - GLM41V_PAD_TOKEN = "<|endoftext|>" - GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]\n" - "{%- for msg in messages -%}" - "{%- if msg.role == 'system' -%}" - "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'user' -%}" - "<|user|>\n" - "{%- if msg.content is string -%}" - "{{ msg.content }}" - "{%- else -%}" - "{%- for item in msg.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'assistant' -%}" - "{%- if msg.metadata -%}" - "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- else -%}" - "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{%- endif -%}" - ) - - def __call__(self, **kwargs): - self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json - stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch - kwargs['stop'] = stop_tokens - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class GLM46VChatHandler(MTMDChatHandler): - GLM46V_EOS_TOKEN = "<|endoftext|>" - GLM46V_PAD_TOKEN = "<|endoftext|>" - GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" - - 
CHAT_FORMAT = ( - "[gMASK]" - "{%- if tools -%}" - "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" - "You are provided with function signatures within XML tags:\n\n" - "{%- for tool in tools -%}" - "{{ tool | tojson(ensure_ascii=False) }}\n" - "{%- endfor -%}" - "\n\nFor each function call, output the function name and arguments within the following XML format:\n" - "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" - "{%- endif -%}" - - "{%- for m in messages -%}" - "{%- if m.role == 'system' -%}" - "<|system|>\n{{ m.content }}" - "{%- elif m.role == 'user' -%}" - "<|user|>\n" - "{%- if m.content is string -%}" - "{{ m.content }}" - "{%- else -%}" - "{%- for item in m.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - # If enable_thinking is disabled, insert `/nothink` according to the source code logic. - "{{ '/nothink' if not enable_thinking else '' }}" - "{%- elif m.role == 'assistant' -%}" - "<|assistant|>" - "{%- if enable_thinking -%}" - "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" - "\n{{ reasoning.strip() }}" - "{%- else -%}" - "\n" - "{%- endif -%}" - "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" - "{%- endif -%}" - "{{ GLM46V_EOS_TOKEN }}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{{ '' if enable_thinking else '\n' }}" - "{%- endif -%}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - GLM-4.6V Handler - Parameters: - - enable_thinking (bool): Whether to enable the model's think process. The default is True. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN - - # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json - kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GraniteDoclingChatHandler(MTMDChatHandler): - """ - Handler for Granite-Docling models. - - Format(512x512): Content - - Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! - Since the model does not have special tokens for the start and end of an image, - it is recommended to process only one image at a time. - You can iterate through the images individually for recognition. 
- - """ - GRANITE_BOS_TOKEN = "<|start_of_role|>" - GRANITE_EOS_TOKEN = "<|end_of_text|>" - GRANITE_PAD_TOKEN = "<|end_of_text|>" - GRANITE_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for part in message['content'] -%}" - "{%- if part['type'] == 'text' -%}" - "{{- part['text'] -}}" - "{%- elif part['type'] == 'image_url' -%}" - "{%- if part.image_url is string -%}" - "{{- part.image_url -}}" - "{%- else -%}" - "{{- part.image_url.url -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '<|end_of_text|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|start_of_role|>assistant' -}}" - # Support the 'controls' parameter if present in generation arguments - "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" - "{{- '<|end_of_role|>' -}}" - "{%- endif -%}" - ) - - def __init__(self, controls: dict = None, **kwargs): - """ - Granite-Docling Handler - Args: - controls (dict, optional): Operational parameters passed to the assistant role. - - The 'controls' parameter is used to guide the model's behavior or output format. 
- Common examples for 'controls' include: - - Document Parsing: {"mode": "document_parsing", "format": "json"} - """ - self.controls = controls - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject controls into the template environment - self.extra_template_arguments["controls"] = self.controls - self.DEFAULT_SYSTEM_MESSAGE = None - kwargs['stop'] = [self.GRANITE_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - - return super().__call__(**kwargs) - - -class LFM2VLChatHandler(MTMDChatHandler): - LFM2VL_BOS_TOKEN = "<|startoftext|>" - LFM2VL_EOS_TOKEN = "<|im_end|>" - LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{ '<|im_start|>' + message['role'] + '\n' }}" - "{%- if message['content'] is string -%}" - "{{ message['content'] }}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if 'image_url' in content -%}" - "{%- if content.image_url is string -%}" - "<|image_start|>{{ content.image_url }}<|image_end|>" - "{%- else -%}" - "<|image_start|>{{ content.image_url.url }}<|image_end|>" - "{%- endif -%}" - "{%- elif content['type'] == 'text' -%}" - "{{ content['text'] }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{ '<|im_end|>\n' }}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{ '<|im_start|>assistant\n' }}" - "{%- endif -%}" - ) - - def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): - """ - LFM2-VL Handler - LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 - """ - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) - - 
def __call__(self, **kwargs): - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class LFM25VLChatHandler(MTMDChatHandler): - """ - Handler for LFM2.5-VL multimodal models. - - Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. - """ - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '' in content -%}\n" - " {%- set content = content.split('')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") - self.image_min_tokens = -1 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class PaddleOCRChatHandler(MTMDChatHandler): - """ - Handler for PaddleOCR 1.5 multimodal models. - """ - - PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" - PADDLEOCR_BOS_TOKEN = "" - PADDLEOCR_EOS_TOKEN = "" - PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" - PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" - PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" - - CHAT_FORMAT = ( - "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" - "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" - "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" - - "{{- cls_token -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'user' -%}" - "{{- 'User: ' -}}" - - # Robust parsing: Check if content is string or list - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - # Pass 1: Render all images first - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" - "{{- '<|IMAGE_START|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|IMAGE_END|>' -}}" - "{%- endif -%}" - "{%- endfor -%}" - - # Pass 2: Render all text second - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '\\n' -}}" - - "{%- elif message['role'] == 'assistant' -%}" - "{{- 
'Assistant:\\n' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- eos_token -}}" - - "{%- elif message['role'] == 'system' -%}" - "{%- if message['content'] is string -%}" - "{{- message['content'] + '\\n' -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] + '\\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "{{- 'Assistant:\\n' -}}" - "{%- endif -%}" - ) - - def __init__( - self, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__( - image_min_tokens=self.image_min_tokens, - image_max_tokens=self.image_max_tokens, - **kwargs - ) - - def __call__(self, **kwargs): - # Set the specific stop token defined in the PaddleOCR template - kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class Qwen25VLChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = 
image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" - "<|im_start|>assistant\n" - ) - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Qwen3VLChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{{- '<|im_start|>system\n' -}}" - "{%- if messages[0].content is string and messages[0].role == 'system' -%}" - "{{- messages[0].content -}}" - "{%- elif messages[0].role == 'system' -%}" - "{%- if 'text' in messages[0].content -%}" - "{{- messages[0].content.text -}}" - "{%- else -%}" - "{{- 'You are a helpful assistant.' 
-}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{{- '\n\n' -}}" - "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" - "{%- for tool in tools -%}" - "{{- '\n' -}}" - "{{- tool | tojson -}}" - "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" - "{%- endif -%}" - "{{- '<|im_end|>\n' -}}" - "{%- set image_count = namespace(value=0) -%}" - #"{%- set video_count = namespace(value=0) -%}" - "{%- for message in messages -%}" - "{%- if message.role == 'tool' -%}" - "{{- '<|im_start|>user\n\n' -}}" - "{%- elif message.role != 'system' -%}" - "{{- '<|im_start|>' + message.role + '\n' -}}" - "{%- endif -%}" - "{%- if message.content is string and message.role != 'system' -%}" - "{{- message.content -}}" - "{%- elif message.role != 'system' -%}" - "{%- for content in message.content -%}" - "{%- if 'image_url' in content -%}" - "{%- set image_count.value = image_count.value + 1 -%}" - "{%- if add_vision_id -%}" - "{{- 'Picture ' -}}" - "{{- image_count.value | string -}}" - "{{- ': ' -}}" - "{%- endif -%}" - "{{- '<|vision_start|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|vision_end|>' -}}" - "{%- endif -%}" - # Video not supported yet - "{%- if 'text' in content -%}" - "{{- content.text -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message.role == 'assistant' -%}" - "{%- if message.tool_calls -%}" - "{%- for tool_call in message.tool_calls -%}" - "{%- if (loop.first and message.content) or (not loop.first) -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- if tool_call.function 
-%}" - "{%- set tool_call = tool_call.function -%}" - "{%- endif -%}" - "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" - "{%- if tool_call.arguments is string -%}" - "{{- tool_call.arguments -}}" - "{%- else -%}" - "{{- tool_call.arguments | tojson -}}" - "{%- endif -%}" - "{{- '}\n' -}}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- elif message.role == 'tool' -%}" - "{{- '' -}}" - "{%- endif -%}" - "{%- if message.role != 'system' -%}" - "{{- '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if force_reasoning -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - force_reasoning: bool = False, - add_vision_id: bool = True, - **kwargs, - ): - """ - Parameters: - - force_reasoning (bool): - - True: Force the reasoning in the model by adding to the chat template. - - False (default): Don't force the reasoning. - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - """ - super().__init__(**kwargs) - self.force_reasoning = force_reasoning - self.extra_template_arguments["force_reasoning"] = force_reasoning - self.extra_template_arguments["add_vision_id"] = add_vision_id - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen35ChatHandler(MTMDChatHandler): - """ - Handler for Qwen3.5/Qwen3.6 models. 
- """ - CHAT_FORMAT = ( - "{%- set image_count = namespace(value=0) -%}" - "{%- set video_count = namespace(value=0) -%}" - "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" - " {%- if content is string -%}" - " {{- content -}}" - " {%- elif content is iterable and content is not mapping -%}" - " {%- for item in content -%}" - " {%- if 'image_url' in item or item.type == 'image_url' -%}" - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain images.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set image_count.value = image_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Picture ' -}}" - " {{- image_count.value | string -}}" - " {{- ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'video' in item -%}" - " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain videos.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set video_count.value = video_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Video ' ~ video_count.value ~ ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {{- item.video -}}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'text' in item -%}" - " {{- item.text -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected item type in content.') -}}" - " {%- endif -%}" - " {%- endfor -%}" - " {%- elif content is none or content is undefined -%}" - " {{- '' -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected content type.') -}}" - " {%- endif -%}" - "{%- endmacro -%}" - "{%- if not messages -%}" - " {{- raise_exception('No messages provided.') 
-}}" - "{%- endif -%}" - "{%- if tools and tools is iterable and tools is not mapping -%}" - " {{- '<|im_start|>system\n' -}}" - " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" - " {%- for tool in tools -%}" - " {{- '\n' -}}" - " {{- tool | tojson -}}" - " {%- endfor -%}" - " {{- '\n' -}}" - " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" - " {%- if messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) | trim -%}" - " {%- if content -%}" - " {{- '\n\n' + content -}}" - " {%- endif -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - "{%- elif messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) -%}" - " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" - "{%- for message in messages[::-1] -%}" - " {%- set index = messages | length - 1 - loop.index0 -%}" - " {%- if ns.multi_step_tool and message.role == 'user' -%}" - " {%- set content = render_content(message.content, false) | trim -%}" - " {%- if not (content.startswith('') and content.endswith('')) -%}" - " {%- set ns.multi_step_tool = false -%}" - " {%- set ns.last_query_index = index -%}" - " {%- endif -%}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if ns.multi_step_tool -%}" - " {{- 
raise_exception('No user query found in messages.') -}}" - "{%- endif -%}" - "{%- for message in messages -%}" - " {%- set content = render_content(message.content, true) | trim -%}" - " {%- if message.role == 'system' -%}" - " {%- if not loop.first -%}" - " {{- raise_exception('System message must be at the beginning.') -}}" - " {%- endif -%}" - " {%- elif message.role == 'user' -%}" - " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" - " {%- elif message.role == 'assistant' -%}" - " {%- set reasoning_content = '' -%}" - " {%- if message.reasoning_content is string -%}" - " {%- set reasoning_content = message.reasoning_content -%}" - " {%- elif '' in content -%}" - " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" - " {%- set content = content.split('')[-1].lstrip('\n') -%}" - " {%- endif -%}" - " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" - " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" - " {%- else -%}" - " {{- '<|im_start|>' + message.role + '\n' + content -}}" - " {%- endif -%}" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_calls -%}" - " {%- if tool_call.function is defined -%}" - " {%- set tool_call = tool_call.function -%}" - " {%- endif -%}" - " {%- if loop.first -%}" - " {%- if content | trim -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- '\n\n\n' -}}" - " {%- endif -%}" - " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_call.arguments | items -%}" - " {{- '\n' -}}" - " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" - " {{- args_value -}}" - " {{- '\n' -}}" - 
" {%- endfor -%}" - " {%- endif -%}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif message.role == 'tool' -%}" - " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" - " {{- '<|im_start|>user' -}}" - " {%- endif -%}" - " {{- '\n\n' -}}" - " {{- content -}}" - " {{- '\n' -}}" - " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif loop.last -%}" - " {{- '<|im_end|>\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- raise_exception('Unexpected message role.') -}}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is defined and enable_thinking is false -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n' -}}" - " {%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - add_vision_id: bool = True, - enable_thinking: bool = True, - preserve_thinking: bool = False, - **kwargs, - ): - """ - Parameters: - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - - preserve_thinking (bool): - - True: Keeps reasoning process for ALL historical conversational turns. - - False (default): Only keeps for the latest assistant reply to save tokens. 
- """ - super().__init__(**kwargs) - self.enable_thinking = enable_thinking - self.preserve_thinking = preserve_thinking - self.extra_template_arguments["add_vision_id"] = add_vision_id - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["preserve_thinking"] = preserve_thinking - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Step3VLChatHandler(MTMDChatHandler): - """ - Handler for Step3-VL models. - """ - - STEP3VL_BOS_TOKEN = "<|im_start|>" - STEP3VL_EOS_TOKEN = "<|im_end|>" - STEP3VL_PAD_TOKEN = "<|endoftext|>" - STEP3VL_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro render_content(content) -%}\n" - " {%- if content is none -%}{{- '' -}}\n" - " {%- elif content is string -%}{{- content -}}\n" - " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" - " {%- elif content is iterable -%}\n" - " {%- for item in content -%}\n" - " {%- if item.type == 'text' -%}\n" - " {{- item['value'] if 'value' in item else item['text'] -}}\n" - " {%- elif item.type in ['image', 'image_url'] -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.image_url -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {{- '' + url_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "\n" - "{%- if tools -%}\n" - " {{- '<|im_start|>system\\n' -}}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" - " {%- endif -%}\n" - " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" - " {%- for tool in tools -%}\n" - " {{- '\\n' -}}\n" - " {{- tool | tojson -}}\n" - " {%- endfor -%}\n" - " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" - "{%- else -%}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - "\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" - "{%- for message in messages[::-1] -%}\n" - " {%- set index = (messages|length - 1) - loop.index0 -%}\n" - " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" - " {%- set ns.multi_step_tool = false -%}\n" - " {%- set ns.last_query_index = index -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- for message in messages -%}\n" - " {%- set content = render_content(message.content) -%}\n" - " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" - " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" - " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" - " {%- elif message.role == 'assistant' -%}\n" - " {%- if message.reasoning_content is string -%}\n" - " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" - " {%- else -%}\n" - " {%- if '' in content -%}\n" - 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" - " {%- else -%}\n" - " {%- set reasoning_content = '' -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if loop.index0 > ns.last_query_index -%}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" - " {%- else -%}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" - " {%- endif -%}\n" - " {%- if message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- for tool_call in message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- if tool_call.function -%}\n" - " {%- set tool_call = tool_call.function -%}\n" - " {%- endif -%}\n" - " {{- '\\n{\"name\": \"' -}}\n" - " {{- tool_call.name -}}\n" - " {{- '\", \"arguments\": ' -}}\n" - " {%- if tool_call.arguments is string -%}\n" - " {{- tool_call.arguments -}}\n" - " {%- else -%}\n" - " {{- tool_call.arguments | tojson -}}\n" - " {%- endif -%}\n" - " {{- '}\\n' -}}\n" - " {%- endfor -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- elif message.role == 'tool' -%}\n" - " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" - " {{- '<|im_start|>tool_response' -}}\n" - " {%- endif -%}\n" - " {{- '\\n\\n' -}}\n" - " {{- content -}}\n" - " {{- '\\n' -}}\n" - " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Step3-VL Handler. - - Args: - enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Pass thinking toggle into Jinja - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Step3 uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -@register_chat_completion_handler("chatml-function-calling") -def chatml_function_calling( - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logprobs: 
Optional[bool] = None, - top_logprobs: Optional[int] = None, - **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: - function_calling_template = ( - "{% for message in messages %}" - "<|im_start|>{{ message.role }}\n" - # System message - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% if tool_calls %}" - "\n\nYou have access to the following functions:\n" - "{% for tool in tools %}" - "\nfunctions.{{ tool.function.name }}:\n" - "{{ tool.function.parameters | tojson }}" - "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" - "\n\nmessage:" - "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "{% endif %}" - "<|im_end|>\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - ## Reglar message - "{% if message.content and message.content | length > 0 %}" - "{% if tool_calls %}" - "message:\n" - "{% endif %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - ## Function calls - "{% if 'tool_calls' in message %}" - "{% for tool_call in message.tool_calls %}" - "functions.{{ tool_call.function.name }}:\n" - "{{ tool_call.function.arguments }}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - ) - template_renderer = ImmutableSandboxedEnvironment( - autoescape=jinja2.select_autoescape(["html", "xml"]), - undefined=jinja2.StrictUndefined, - 
).from_string(function_calling_template) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - stop = ( - [stop, "<|im_end|>"] - if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] - ) - - # Case 1: No tool choice by user - if ( - tool_choice is None - or (isinstance(tool_choice, str) and tool_choice == "none") - or tools is None - or len(tools) == 0 - ): - prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, - ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - 
grammar=grammar, - logprobs=top_logprobs if logprobs else None, - ), - stream=stream, - ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, + ) # Case 2: Tool choice by user if isinstance(tool_choice, dict): @@ -6244,3 +3539,35 @@ def chatml_function_calling( } raise ValueError("Automatic streaming tool choice is not supported") + +# Backward compatibility re-exports. +# These multimodal chat handlers have been moved to `llama_multimodal`. +# New code should import them from `llama_cpp.llama_multimodal` instead of +# `llama_cpp.llama_chat_format`. 
+from llama_cpp.llama_multimodal import ( + MTMDChatHandler, + GenericMTMDChatHandler, + Llava15ChatHandler, + ObsidianChatHandler, + MoondreamChatHandler, + Llava16ChatHandler, + NanoLlavaChatHandler, + Llama3VisionAlphaChatHandler, + Llama3VisionAlpha, + MiniCPMv26ChatHandler, + MiniCPMv45ChatHandler, + MiniCPMV46ChatHandler, + Gemma3ChatHandler, + Gemma4ChatHandler, + GLM41VChatHandler, + GLM46VChatHandler, + GraniteDoclingChatHandler, + LFM2VLChatHandler, + LFM25VLChatHandler, + PaddleOCRChatHandler, + Qwen25VLChatHandler, + Qwen3ASRChatHandler, + Qwen3VLChatHandler, + Qwen35ChatHandler, + Step3VLChatHandler +) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 416e8b9357..1e81d80f65 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -55,6 +55,8 @@ LLAMA_MAX_DEVICES = _lib.llama_max_devices() +LLAMA_MAX_SEQ = 256 + # define LLAMA_DEFAULT_SEED 0xFFFFFFFF LLAMA_DEFAULT_SEED = 0xFFFFFFFF @@ -122,20 +124,21 @@ # LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization # LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming # }; -LLAMA_VOCAB_TYPE_NONE = 0 -"""For models without vocab""" -LLAMA_VOCAB_TYPE_SPM = 1 -"""LLaMA tokenizer based on byte-level BPE with byte fallback""" -LLAMA_VOCAB_TYPE_BPE = 2 -"""GPT-2 tokenizer based on byte-level BPE""" -LLAMA_VOCAB_TYPE_WPM = 3 -"""BERT tokenizer based on WordPiece""" -LLAMA_VOCAB_TYPE_UGM = 4 -"""T5 tokenizer based on Unigram""" -LLAMA_VOCAB_TYPE_RWKV = 5 -"""RWKV tokenizer based on greedy tokenization""" -LLAMA_VOCAB_TYPE_PLAMO2 = 6 -"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" +class llama_vocab_type(enum.IntEnum): + LLAMA_VOCAB_TYPE_NONE = 0 + """For models without vocab""" + LLAMA_VOCAB_TYPE_SPM = 1 + """LLaMA tokenizer based on byte-level BPE with byte fallback""" + LLAMA_VOCAB_TYPE_BPE = 2 + """GPT-2 tokenizer based on byte-level BPE""" + LLAMA_VOCAB_TYPE_WPM = 3 + """BERT tokenizer based on 
WordPiece""" + LLAMA_VOCAB_TYPE_UGM = 4 + """T5 tokenizer based on Unigram""" + LLAMA_VOCAB_TYPE_RWKV = 5 + """RWKV tokenizer based on greedy tokenization""" + LLAMA_VOCAB_TYPE_PLAMO2 = 6 + """PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" # NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp) @@ -193,58 +196,65 @@ # LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, # LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, # LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, +# LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, +# LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, +# LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, # }; -LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 -LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 -LLAMA_VOCAB_PRE_TYPE_FALCON = 4 -LLAMA_VOCAB_PRE_TYPE_MPT = 5 -LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 -LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 -LLAMA_VOCAB_PRE_TYPE_REFACT = 8 -LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 -LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 -LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 -LLAMA_VOCAB_PRE_TYPE_OLMO = 12 -LLAMA_VOCAB_PRE_TYPE_DBRX = 13 -LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 -LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 -LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 -LLAMA_VOCAB_PRE_TYPE_VIKING = 18 -LLAMA_VOCAB_PRE_TYPE_JAIS = 19 -LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 -LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 -LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 -LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 -LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 -LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 -LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 -LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 -LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 -LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 -LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 -LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 -LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 -LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 -LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 -LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 -LLAMA_VOCAB_PRE_TYPE_GROK_2 
= 39 -LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 -LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 -LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 -LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 -LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 -LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 -LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 -LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 -LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 -LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 -LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 +class llama_vocab_pre_type(enum.IntEnum): + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 + LLAMA_VOCAB_PRE_TYPE_FALCON = 4 + LLAMA_VOCAB_PRE_TYPE_MPT = 5 + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 + LLAMA_VOCAB_PRE_TYPE_REFACT = 8 + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 + LLAMA_VOCAB_PRE_TYPE_OLMO = 12 + LLAMA_VOCAB_PRE_TYPE_DBRX = 13 + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 + LLAMA_VOCAB_PRE_TYPE_PORO = 15 + LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 + LLAMA_VOCAB_PRE_TYPE_VIKING = 18 + LLAMA_VOCAB_PRE_TYPE_JAIS = 19 + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 + LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 + LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 + LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 + LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 + LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 + LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 + LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39 + LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 + LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 + LLAMA_VOCAB_PRE_TYPE_AFMOE 
= 42 + LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 + LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 + LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 + LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 + LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 + LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 + LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 + LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 + LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51 + LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52 + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53 # // note: these values should be synchronized with ggml_rope @@ -257,12 +267,13 @@ # LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE, # LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, # }; -LLAMA_ROPE_TYPE_NONE = -1 -LLAMA_ROPE_TYPE_NORM = 0 -LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 -LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 -LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 -LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 +class llama_rope_type(enum.IntEnum): + LLAMA_ROPE_TYPE_NONE = -1 + LLAMA_ROPE_TYPE_NORM = 0 + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 + LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 + LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 + LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 # enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file @@ -274,13 +285,14 @@ # LLAMA_TOKEN_TYPE_UNUSED = 5, # LLAMA_TOKEN_TYPE_BYTE = 6, # }; -LLAMA_TOKEN_TYPE_UNDEFINED = 0 -LLAMA_TOKEN_TYPE_NORMAL = 1 -LLAMA_TOKEN_TYPE_UNKNOWN = 2 -LLAMA_TOKEN_TYPE_CONTROL = 3 -LLAMA_TOKEN_TYPE_USER_DEFINED = 4 -LLAMA_TOKEN_TYPE_UNUSED = 5 -LLAMA_TOKEN_TYPE_BYTE = 6 +class llama_token_type(enum.IntEnum): + LLAMA_TOKEN_TYPE_UNDEFINED = 0 + LLAMA_TOKEN_TYPE_NORMAL = 1 + LLAMA_TOKEN_TYPE_UNKNOWN = 2 + LLAMA_TOKEN_TYPE_CONTROL = 3 + LLAMA_TOKEN_TYPE_USER_DEFINED = 4 + LLAMA_TOKEN_TYPE_UNUSED = 5 + LLAMA_TOKEN_TYPE_BYTE = 6 # enum llama_token_attr { @@ -355,45 +367,46 @@ # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; -LLAMA_FTYPE_ALL_F32 = 0 -LLAMA_FTYPE_MOSTLY_F16 = 1 
-LLAMA_FTYPE_MOSTLY_Q4_0 = 2 -LLAMA_FTYPE_MOSTLY_Q4_1 = 3 -LLAMA_FTYPE_MOSTLY_Q8_0 = 7 -LLAMA_FTYPE_MOSTLY_Q5_0 = 8 -LLAMA_FTYPE_MOSTLY_Q5_1 = 9 -LLAMA_FTYPE_MOSTLY_Q2_K = 10 -LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 -LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 -LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 -LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 -LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 -LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 -LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 -LLAMA_FTYPE_MOSTLY_Q6_K = 18 -LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 -LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 -LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 -LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 -LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 -LLAMA_FTYPE_MOSTLY_IQ1_S = 24 -LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 -LLAMA_FTYPE_MOSTLY_IQ3_S = 26 -LLAMA_FTYPE_MOSTLY_IQ3_M = 27 -LLAMA_FTYPE_MOSTLY_IQ2_S = 28 -LLAMA_FTYPE_MOSTLY_IQ2_M = 29 -LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 -LLAMA_FTYPE_MOSTLY_IQ1_M = 31 -LLAMA_FTYPE_MOSTLY_BF16 = 32 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 -# LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 -LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 -LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 -LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 -LLAMA_FTYPE_MOSTLY_NVFP4 = 39 -LLAMA_FTYPE_MOSTLY_Q1_0 = 40 -LLAMA_FTYPE_GUESSED = 1024 +class llama_ftype(enum.IntEnum): + LLAMA_FTYPE_ALL_F32 = 0 + LLAMA_FTYPE_MOSTLY_F16 = 1 + LLAMA_FTYPE_MOSTLY_Q4_0 = 2 + LLAMA_FTYPE_MOSTLY_Q4_1 = 3 + LLAMA_FTYPE_MOSTLY_Q8_0 = 7 + LLAMA_FTYPE_MOSTLY_Q5_0 = 8 + LLAMA_FTYPE_MOSTLY_Q5_1 = 9 + LLAMA_FTYPE_MOSTLY_Q2_K = 10 + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 + LLAMA_FTYPE_MOSTLY_Q6_K = 18 + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 + LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 + LLAMA_FTYPE_MOSTLY_IQ1_S = 24 + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 + LLAMA_FTYPE_MOSTLY_IQ3_S = 26 + LLAMA_FTYPE_MOSTLY_IQ3_M = 27 + 
LLAMA_FTYPE_MOSTLY_IQ2_S = 28 + LLAMA_FTYPE_MOSTLY_IQ2_M = 29 + LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 + LLAMA_FTYPE_MOSTLY_IQ1_M = 31 + LLAMA_FTYPE_MOSTLY_BF16 = 32 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 + # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 + LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 + LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 + LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 + LLAMA_FTYPE_MOSTLY_NVFP4 = 39 + LLAMA_FTYPE_MOSTLY_Q1_0 = 40 + LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { # LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, @@ -471,6 +484,14 @@ class llama_split_mode(enum.IntEnum): LLAMA_SPLIT_MODE_ROW = 2 LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +class llama_context_type(enum.IntEnum): + LLAMA_CONTEXT_TYPE_DEFAULT = 0 + LLAMA_CONTEXT_TYPE_MTP = 1 + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -827,9 +848,12 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] +# uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch) # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. 
MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -843,13 +867,14 @@ class llama_sampler_seq_config(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] + # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution @@ -862,16 +887,20 @@ class llama_sampler_seq_config(ctypes.Structure): # bool no_perf; // measure performance timings # bool op_offload; // offload host tensor operations to device # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) -# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some casesAdd commentMore actions -# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # bool kv_unified; // use a unified buffer across the input sequences when computing the attention -# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix -# // ref: 
https://github.com/ggml-org/llama.cpp/pull/14363 +# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix +# // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + # // [EXPERIMENTAL] # // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) # // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) # struct llama_sampler_seq_config * samplers; # size_t n_samplers; +# // a source/target/parent context +# // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts +# struct llama_context * ctx_other; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -881,12 +910,17 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] + n_outputs_max (int): max outputs in a ubatch (0 = n_batch) n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + + ctx_type (int): set the context type (e.g. 
MTP) rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings flash_attn_type (int): when to enable Flash Attention + rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -895,20 +929,27 @@ class llama_context_params(ctypes.Structure): yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size defrag_thold (float): [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) + cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval + type_k (int): data type for K cache type_v (int): data type for V cache + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU no_perf (bool): whether to measure performance timings op_offload(bool): whether to offload host tensor operations to device swa_full(bool): whether to use full-size SWA cache kv_unified(bool): use a unified buffer across the input sequences when computing the attention + samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) n_samplers(size_t): numbers of sampler chains + + ctx_other(llama_context *): a source/target/parent context can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts """ if TYPE_CHECKING: @@ -916,8 +957,11 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int + n_outputs_max: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -944,14 +988,18 @@ class llama_context_params(ctypes.Structure): kv_unified:bool samplers: ctypes.c_void_p n_samplers: int + ctx_other: ctypes.c_void_p _fields_ = [ ("n_ctx", ctypes.c_uint32), ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), + ("n_outputs_max", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -978,6 +1026,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ("samplers", llama_sampler_seq_config_p), ("n_samplers", ctypes.c_int), + ("ctx_other", ctypes.c_void_p), ] llama_context_params_p = ctypes.POINTER(llama_context_params) @@ -1602,6 +1651,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: + ... + + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: @@ -2763,6 +2818,8 @@ def llama_state_seq_load_file( ) -> int: ... 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0 +LLAMA_STATE_SEQ_FLAGS_NONE = 0 # // for backwards-compat LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 @@ -2770,6 +2827,10 @@ def llama_state_seq_load_file( # // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 +# // keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +# // Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag. +LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 + llama_state_seq_flags = ctypes.c_uint32 # LLAMA_API size_t llama_state_seq_get_size_ext( @@ -3031,11 +3092,15 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): # // Set whether the model is in warmup mode or not # // If true, all model tensors are activated during llama_decode() to load and cache their weights. -# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +# // +# // note: using this can cause extra graph reallocations because it changes the graph topology with MoE models, +# // so it is generally not recommended to use in practice. will be removed in the future +# DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup), +# "user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]"); @ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): - """ Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights""" + """DEPRECATED: using this can cause extra graph reallocations because it changes the graph topology with MoE models, + so it is generally not recommended to use in practice. will be removed in the future""" ... 
# // Set abort callback diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 7c8ad1e90f..0c1df339ce 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -303,9 +303,7 @@ def rank(self, query: str, documents: List[str]) -> List[float]: # 1. Attempt to retrieve the built-in 'rerank' chat template from model metadata. # Modern GGUF models often include a template for formatting query/document pairs. - rerank_template = llama_cpp.llama_model_chat_template(self._model.model, b"rerank") - if rerank_template: - rerank_template = rerank_template.decode("utf-8") + rerank_template = self._model.model_chat_template(b"rerank") batch_inputs: List[List[int]] = [] diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 3c431fc3d8..67ad424490 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -465,18 +465,18 @@ def __init__(self, content: str, deps: list = None): SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - 'boolean' : BuiltinRule('("true" | "false") space', []), + 'boolean' : BuiltinRule('("true" | "false")', []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), - 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), - 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part)', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), - 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), - 'array' : BuiltinRule('"[" space ( value ("," space value)* )? 
"]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []), 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), - 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), - 'null' : BuiltinRule('"null" space', []), + 'string' : BuiltinRule(r'"\"" char* "\""', ['char']), + 'null' : BuiltinRule('"null"', []), } # TODO: support "uri", "email" string formats @@ -484,9 +484,9 @@ def __init__(self, content: str, deps: list = None): 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? 
( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), - 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), - 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), - 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), + 'date-string' : BuiltinRule('"\\"" date "\\""', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\""', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']), } DOTALL = '[\\U00000000-\\U0010FFFF]' @@ -585,7 +585,7 @@ def visit(node): out.append(f'[^"{"".join(rejects)}] {char_rule}*') visit(trie) - out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + out.append(f' ){"" if trie.is_end_of_string else "?"} ["]') return ''.join(out) def _add_rule(self, name, rule): @@ -815,7 +815,7 @@ def join_seq(): return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"") def _resolve_ref(self, ref): @@ -846,10 +846,10 @@ def visit(self, schema, name): return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') + return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) elif 'enum' in schema: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -890,7 +890,7 @@ def add_component(comp_schema, is_required): enum_intersection &= s if enum_intersection: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') 
space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')' return self._add_rule(rule_name, rule) return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) @@ -904,12 +904,12 @@ def add_component(comp_schema, is_required): ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') for i, item in enumerate(items)) + - ' "]" space') + ' space "]"') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"') elif schema_type in (None, 'string') and 'pattern' in schema: return self._visit_pattern(schema['pattern'], rule_name) @@ -929,7 +929,7 @@ def add_component(comp_schema, is_required): min_len = schema.get('minLength', 0) max_len = schema.get('maxLength') - return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""') elif schema_type in (None, 'integer') and \ ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): @@ -946,7 +946,7 @@ def add_component(comp_schema, is_required): out = ["("] _generate_min_max_int(min_value, max_value, out) - out.append(") space") + out.append(")") return self._add_rule(rule_name, ''.join(out)) elif (schema_type == 'object') or (len(schema) == 0): @@ -1031,7 +1031,7 @@ def get_recursive_refs(ks, first_is_optional): rule += ' )' rule += ' )?' 
- rule += ' "}" space' + rule += ' space "}"' return rule diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py new file mode 100644 index 0000000000..f1b320b772 --- /dev/null +++ b/llama_cpp/llama_multimodal.py @@ -0,0 +1,3686 @@ +from __future__ import annotations + +import base64 +import ctypes +import json +import os +import sys +import zlib + +from contextlib import ExitStack +from typing import ( + Any, + Dict, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, + Protocol, + TYPE_CHECKING, + cast, +) + +import urllib.request +from urllib.error import URLError, HTTPError + +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar + +if TYPE_CHECKING: + import llama_cpp.llama as llama_core + +from ._logger import ggml_log_callback + +from llama_cpp.llama_chat_format import ( + _convert_completion_to_chat, + _convert_completion_to_chat_function, + _grammar_for_response_format, + ImmutableSandboxedEnvironment +) + +class MTMDChatHandler: + DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful." 
+ ) + + CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% elif message.role == 'user' %}" + "USER: " + "{% if message.content is string %}" + "{{ message.content }}" + "{% elif message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" + "{% elif content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" + "{% endif %}" + "{{ \"\n\" }}" + "{% endfor %}" + + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + + "{% if add_generation_prompt %}" + "ASSISTANT: " + "{% endif %}" + ) + + KNOWN_MEDIA_TAGS: List[str] = [] + + def __init__( + self, + mmproj_path: Optional[str] = None, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, + batch_max_tokens: int = 1024, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + self.verbose = verbose + + # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`. + # Accept it for existing user code, warn during initialization, and normalize + # all internal usage to `mmproj_path`. 
+ clip_model_path = kwargs.pop("clip_model_path", None) + if mmproj_path is None and clip_model_path is not None: + mmproj_path = clip_model_path + if self.verbose: + print( + f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; " + "please use `mmproj_path` instead.", + file=sys.stderr, + ) + + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." + ) + + if mmproj_path is None: + raise ValueError( + f"{self.log_prefix}(__init__): `mmproj_path` is required. " + "`clip_model_path` is accepted only as a deprecated compatibility alias." + ) + + self.mmproj_path = mmproj_path + if not os.path.exists(self.mmproj_path): + raise ValueError( + f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}" + ) + + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens + self.use_gpu = use_gpu + + import llama_cpp.mtmd_cpp as mtmd_cpp + self._mtmd_cpp = mtmd_cpp + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} + + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + + # Pre-compile Jinja template + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: + self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override + + self._chat_format_parser_tags = [] + self._change_chat_template(self.chat_format) + + self._exit_stack = ExitStack() + + def _change_chat_template(self, new_template: str): + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) + + def 
_init_mtmd_context(self, llama_model: llama_core.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.mmproj_path.encode(), + llama_model.model, + self.mctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") + + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + 
print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) + + def close(self) -> None: + """Explicitly free the mtmd context and vision model resources.""" + if getattr(self, "mtmd_ctx", None) is not None: + try: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + except Exception: + pass + self.mtmd_ctx = None + self.mctx_params = None + self.chat_template = None + + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None + + def __del__(self) -> None: + self.close() + + def _get_media_url( + self, + content: Dict[str, Any], + keys: Tuple[str, ...], + media_type: str, + ) -> str: + """ + Extract a media URL or data URI from a multimodal content item. + + Different chat templates and client APIs may represent the same media + payload with slightly different keys. For example, an image may appear as + `image`, `image_url`, or a typed chunk with `{"type": "image", ...}`. 
+ This helper checks the provided keys in order and returns the first usable + media payload. + + Returns an empty string when none of the requested keys exist or when the + payload shape is unsupported. The caller is responsible for raising a + media-type-specific error when an empty value is not acceptable. + """ + # Try keys in priority order. This lets callers prefer canonical fields + # such as "image" over compatibility aliases such as "image_url", while + # still accepting either representation. + value = None + for key in keys: + if key in content: + value = content[key] + break + + # String payloads may already be URLs, local paths, or data URIs. + if isinstance(value, str): + return value + + if isinstance(value, dict): + # Common OpenAI-style shape: + # {"image_url": {"url": "..."}} + if "url" in value: + return value["url"] + + # Forward-compatible inline media shape: + # {"audio": {"data": "...", "format": "wav"}} + # + # Convert it to a data URI so downstream media loading does not need + # separate branches for raw base64 payloads. + if "data" in value and "format" in value: + media_format = value.get("format", "") + media_data = value.get("data", "") + if media_format and media_data: + return f"data:{media_type}/{media_format};base64,{media_data}" + + return "" + + def _get_media_items( + self, + messages: List[llama_types.ChatCompletionRequestMessage], + ) -> List[Dict[str, str]]: + """ + Extract media payloads from chat messages in message/content order. 
+ + Supports OpenAI-style typed media chunks as well as template-friendly + variants used by multimodal chat templates, such as: + - {"type": "image_url", "image_url": {"url": "..."}} + - {"type": "image", "image": "..."} + - {"image": "..."} + - {"type": "audio_url", "audio_url": {"url": "..."}} + - {"type": "audio", "audio": "..."} + - {"type": "input_audio", "input_audio": {"data": "...", "format": "wav"}} + - {"type": "video_url", "video_url": {"url": "..."}} + - {"type": "video", "video": "..."} + - {"video": "..."} + + The returned order must match the media placeholders emitted by the rendered + chat template as closely as possible. + """ + media_items: List[Dict[str, str]] = [] + + for message in messages: + content_list = message.get("content") + if not isinstance(content_list, list): + continue + + for content in content_list: + if not isinstance(content, dict): + continue + + content_type = content.get("type", "") + + has_image = ( + content_type in ("image", "image_url") + or "image" in content + or "image_url" in content + ) + has_audio = ( + content_type in ("audio", "audio_url", "input_audio") + or "audio" in content + or "audio_url" in content + or "input_audio" in content + ) + has_video = ( + content_type in ("video", "video_url") + or "video" in content + or "video_url" in content + ) + + media_kind_count = int(has_image) + int(has_audio) + int(has_video) + if media_kind_count > 1: + raise ValueError( + f"{self.log_prefix}: content item contains multiple media types; " + "each content item must contain only one of image, audio, or video." + ) + + # 1. Vision Processing + if has_image: + if not self.is_support_vision: + raise ValueError( + f"{self.log_prefix}: This mmproj model instance does not support image inputs." 
+ ) + + url = self._get_media_url( + content, + keys=("image", "image_url"), + media_type="image", + ) + if not url: + raise ValueError(f"{self.log_prefix}: missing image url/data.") + + media_items.append({"url": url, "type": "image"}) + + # 2. Audio Processing + elif has_audio: + if not self.is_support_audio: + raise ValueError( + f"{self.log_prefix}: This mmproj model instance does not support audio inputs." + ) + + if content_type == "input_audio" or "input_audio" in content: + input_audio = content.get("input_audio", {}) + + if isinstance(input_audio, dict) and "data" in input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + + # Strictly align with llama.cpp. + if audio_format not in ["wav", "mp3"]: + raise ValueError( + f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'" + ) + + url = f"data:audio/{audio_format};base64,{audio_data}" + else: + url = input_audio if isinstance(input_audio, str) else "" + else: + url = self._get_media_url( + content, + keys=("audio", "audio_url"), + media_type="audio", + ) + + if not url: + raise ValueError(f"{self.log_prefix}: missing audio url/data.") + + media_items.append({"url": url, "type": "audio"}) + + # 3. Video Processing + elif has_video: + if not self.is_support_video: + raise ValueError( + f"{self.log_prefix}: This libmtmd build does not support video inputs." + ) + + url = self._get_media_url( + content, + keys=("video", "video_url"), + media_type="video", + ) + if not url: + raise ValueError(f"{self.log_prefix}: missing video url/data.") + + media_items.append({"url": url, "type": "video"}) + + # 4. 
Text & Unknown Types + elif content_type == "text" or "text" in content: + continue + else: + if self.verbose: + print( + f"{self.log_prefix}: ignored unknown content type '{content_type}'.", + file=sys.stderr, + ) + + return media_items + + def _create_bitmap_from_bytes(self, media_bytes: bytes): + """ + Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. + + Supported formats: + - Images (via stb_image): jpg, png, bmp, etc. + - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. + + Note: + - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. + - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. + + Args: + media_bytes (bytes): The raw byte content of the media file. + + Returns: + bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL + """ + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + buf, + len(media_bytes), + False, + ) + + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) + + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." 
+ ) + + return wrapper.bitmap, wrapper.video_ctx + + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) + + def _process_mtmd_prompt( + self, + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + add_generation_prompt: bool = True, + ) -> Tuple[List[int], List[tuple], Any, List[Any]]: + """ + Core multimodal preprocessing pipeline. + Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. + + Features: + - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. + - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. + - Strict RAII-style C++ memory management to prevent leaks on failure. + + Returns: + full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. + chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). + chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). + bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. + """ + # 1. 
Inject default system prompt if omitted by the user + system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") + if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: + messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages + + media_items = self._get_media_items(messages) + media_marker = self.media_marker + + # 2. Render the chat template and replace actual URLs with C++ media markers + text = self.chat_template.render( + messages=messages, + add_generation_prompt=add_generation_prompt, + eos_token=self.mtmd_eos_token, + bos_token=self.mtmd_bos_token, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + **getattr(self, 'extra_template_arguments', {}) + ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text.replace(tag, media_marker) + + # Replace image_url by media_marker in text + for item in media_items: + text = text.replace(item["url"], media_marker) + + if self.verbose: + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) + + # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding + bitmaps = [None] * len(media_items) + bitmap_cleanup = [] + video_cleanup = [] + chunks = None + + try: + # Concurrent Media Decoding + import concurrent.futures + if media_items: + def _create_bitmap_func(idx: int, item: dict): + media_bytes = self.load_media(item["url"], item["type"]) + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx + # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, + # which can be used in the future to process large numbers of video frames. 
+ max_workers = min(llama.n_threads, len(media_items)) + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] + + for future in concurrent.futures.as_completed(futures): + idx, bitmap, video_ctx = future.result() + + bitmaps[idx] = bitmap + bitmap_cleanup.append(bitmap) + + if video_ctx: + video_cleanup.append(video_ctx) + + # Strict validation: Abort if any thread failed to decode its assigned media + if any(b is None for b in bitmaps): + raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") + else: + if self.verbose: + print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") + else: + # If there are no images, set the bitmaps to empty. + bitmaps = [] + + # 4. Initialize mtmd_input_chunks + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = (llama.n_tokens == 0) + input_text.parse_special = True + + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") + + # 5. Hybrid Tokenization (Text + Media binding) + if len(bitmaps) > 0: + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) + ) + else: + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. 
+ if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + + # 6. Virtual Token Ledger Construction + full_prompt_ids = [] + chunk_token_spans = [] + current_idx = 0 + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + # Cursor to track the actual media contents (URLs or base64 data) provided by the user + media_items_count = len(media_items) + media_items_cur = 0 + last_media_id = None + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: continue + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if self._is_text_chunk(chunk_type): + # Extract standard text token IDs + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) + if tokens_ptr and n_tokens_out.value > 0: + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) + full_prompt_ids.extend(tokens) + current_idx += len(tokens) + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + # Extract media properties + # Note(JamePeng): + # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). + # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. + # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if media_items_cur < media_items_count: + # The C++ parser only sees identical placeholders (e.g., "<__media__>"). + # We MUST inject the actual media content's identity here. 
+ real_media_url = media_items[media_items_cur]["url"] + # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) + # Generate a deterministic, unique negative ID for this specific image/audio. + # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). + # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with + # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). + # This empowers `longest_token_prefix` to correctly identify and reuse cached images, + # while instantly breaking the match if the image content changes. + # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id + media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id + else: + # Magic Negative Number as fallback :) + media_id = -314159 + + if self.verbose: + print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") + + chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) + + # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache + full_prompt_ids.extend([media_id] * chunk_n_tokens) + current_idx += chunk_n_tokens + else: + raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") + + return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup + + except Exception as e: + # Ensure no useless pointers remain upon any failure + # Free chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Free bitmaps + if len(bitmap_cleanup) > 0: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in 
def __call__(
    self,
    *,
    llama: llama_core.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    seed: Optional[int] = None,
    response_format: Optional[
        llama_types.ChatCompletionRequestResponseFormat
    ] = None,
    max_tokens: Optional[int] = None,
    present_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    top_n_sigma: float = -1.00,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    xtc_threshold: float = 0.1,
    xtc_probability: float = 0.0,
    dry_multiplier: float = 0.0,
    dry_base: float = 1.75,
    dry_allowed_length: int = 2,
    dry_penalty_last_n:int = 0,
    dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"],
    adaptive_target : float = -1.0,
    adaptive_decay : float = 0.9,
    use_infill: bool = False,
    model: Optional[str] = None,
    logits_processor: Optional[llama_core.LogitsProcessorList] = None,
    grammar: Optional[llama_grammar.LlamaGrammar] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    top_logprobs: Optional[int] = None,
    add_generation_prompt: bool = True,
    reasoning_budget: int = -1,
    reasoning_start: str = "",
    reasoning_end: str = "",
    reasoning_budget_message: Optional[str] = None,
    reasoning_start_in_prompt: bool = False,
    reasoning_start_max_tokens: Optional[int] = 32,
    **kwargs,  # type: ignore
) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
    """
    Run one multimodal chat completion.

    The method renders and tokenizes the conversation through
    `_process_mtmd_prompt`, reconciles the resulting token ledger with the
    tokens already held in `llama.input_ids`, evaluates only the chunks that
    are not yet covered, and finally delegates sampling to
    `llama.create_completion`.

    Args:
        llama: The model wrapper whose context, `input_ids` and `n_tokens`
            are read and updated in place.
        messages: Chat messages handed to `_process_mtmd_prompt`.
        functions / function_call: Legacy OpenAI-style fields. After prompt
            processing they are converted to `tools` / `tool_choice`.
        tools / tool_choice: When `tool_choice` is a dict and `tools` is
            given, the named tool's JSON schema is turned into a grammar and
            the result is converted with
            `_convert_completion_to_chat_function`.
        response_format: A value whose "type" is "json_object" replaces
            `grammar` with `_grammar_for_response_format(response_format)`.
        logprobs / top_logprobs: `top_logprobs` is forwarded as the
            `logprobs` argument of `create_completion` only when `logprobs`
            is truthy.
        add_generation_prompt: Forwarded to `_process_mtmd_prompt`.
        **kwargs: Accepted but not referenced anywhere in this method.
        All remaining sampling and reasoning parameters are forwarded
        unchanged to `llama.create_completion`.

    Returns:
        The value of `_convert_completion_to_chat_function(...)` when a tool
        was selected, otherwise `_convert_completion_to_chat(...)`.

    Raises:
        RuntimeError: A media chunk does not fit in the context and either
            `llama._ctx.memory_can_shift()` is false or no tokens can be
            discarded.
        ValueError: `mtmd_helper_eval_chunk_single` returned a non-zero
            code, or the requested tool name is not present in `tools`.

    NOTE(review): `stop` and `dry_seq_breakers` use mutable list defaults.
    This method only forwards them; confirm that no callee mutates them.
    """
    # 1. Initialize mtmd context
    self._init_mtmd_context(llama)
    assert self.mtmd_ctx is not None

    # 2. Concurrent Preprocessing & Ledger Construction
    full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt(
        llama=llama,
        messages=messages,
        functions=functions,
        function_call=function_call,
        tools=tools,
        tool_choice=tool_choice,
        add_generation_prompt=add_generation_prompt,
    )

    if self.verbose:
        print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr)

    # From here on the chunk list and bitmaps are owned by this method; the
    # `finally` clause below frees them on success and on failure alike.
    try:
        # 3. KV Cache Synchronization & State Rollback
        # Compares the virtual ledger with physical history to prevent Cache Poisoning.
        current_history = llama.input_ids[:llama.n_tokens].tolist()
        longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose)

        # Only part of the stored history matches the new prompt: roll the
        # context back before evaluating anything new.
        if longest_prefix < llama.n_tokens:
            if llama.is_hybrid and llama._hybrid_cache_mgr is not None:
                if llama._hybrid_cache_mgr.max_checkpoints > 0:
                    if self.verbose:
                        print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). "
                              f"Searching for nearest checkpoint...", file=sys.stderr)

                    best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0)
                    if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0):
                        llama.n_tokens = best_ckpt.pos
                        if self.verbose:
                            print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr)
                    else:
                        if self.verbose:
                            print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr)
                        llama._hybrid_cache_mgr.clear()
                        llama._ctx.memory_clear(True)
                        llama.n_tokens = 0
                else:
                    if self.verbose:
                        print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr)
                    llama._hybrid_cache_mgr.clear()
                    llama._ctx.memory_clear(True)
                    llama.n_tokens = 0
            else:
                if self.verbose:
                    print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr)
                llama._ctx.memory_seq_rm(0, longest_prefix, -1)
                llama.n_tokens = longest_prefix

        n_past = llama.n_tokens

        for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans:
            # Skip previously matched chunks
            if end_idx <= n_past:
                continue

            if self._is_text_chunk(chunk_type):
                # Offset, inside this chunk, of the first token that lies beyond n_past.
                unprocessed_start = max(start_idx, n_past) - start_idx
                n_tokens_out = ctypes.c_size_t()
                tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out))

                if tokens_ptr and n_tokens_out.value > 0:
                    all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
                    tokens_to_eval = all_tokens[unprocessed_start:]

                    if tokens_to_eval:
                        if self.verbose:
                            print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
                        # Text evaluation delegates shift and chunking to native llama.eval
                        llama.eval(tokens_to_eval)
                        n_past = llama.n_tokens

            elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
                chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr)

                if self.verbose:
                    media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO"
                    print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr)

                # Stage 5: Multimodal Physical OOM Defense
                if n_past + chunk_n_tokens > llama.n_ctx():
                    if not llama._ctx.memory_can_shift():
                        raise RuntimeError(
                            f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend "
                            f"(n_pos_per_embd > 1 or incompatible M-RoPE). "
                            f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), "
                            f"You MUST increase n_ctx to fit the dialogue."
                        )
                    else:
                        # Safely discard oldest tokens while preserving system prompts
                        # The overflow plus one extra n_batch is requested, then capped so
                        # that the first n_keep tokens are never part of the discarded range.
                        n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch
                        n_keep = min(llama.n_keep, n_past)
                        n_discard = min(n_discard, n_past - n_keep)

                        if n_discard <= 0:
                            raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.")

                        if self.verbose:
                            print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr)

                        # Execute physical memory shift
                        llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard)
                        llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard)

                        # Shift python virtual array to match
                        remaining_len = n_past - (n_keep + n_discard)
                        if remaining_len > 0:
                            llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past]

                        n_past -= n_discard
                        llama.n_tokens = n_past

                # Execute C++ Multimodal Black-box Extraction
                # new_n_past is an out-parameter: its value after the call becomes the new n_past.
                new_n_past = llama_cpp_lib.llama_pos(0)
                result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
                    self.mtmd_ctx,
                    llama._ctx.ctx,
                    chunk_ptr,
                    llama_cpp_lib.llama_pos(n_past),
                    llama_cpp_lib.llama_seq_id(0),
                    llama.n_batch,
                    True,  # logits_last = True, drastically saves computational overhead
                    ctypes.byref(new_n_past)
                )

                if result != 0:
                    raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.")

                # Update Ledger with "Negative Reverse Vocabulary" IDs
                llama.input_ids[n_past : new_n_past.value] = media_id
                n_past = new_n_past.value
                llama.n_tokens = n_past

        # Extract the final, perfectly synchronized prompt sequence
        prompt = llama.input_ids[: llama.n_tokens].tolist()

        # End-of-Turn Checkpoint
        # Anchors the state ONLY after the entire multi-modal turn is processed
        if (
            llama.is_hybrid
            and llama._hybrid_cache_mgr is not None
            and llama._hybrid_cache_mgr.max_checkpoints > 0
        ):
            if self.verbose:
                print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr)

            llama._hybrid_cache_mgr.save_checkpoint(
                current_pos=llama.n_tokens,
                tokens=prompt,
                seq_id=0
            )
    finally:
        # Cleanup chunks
        if chunks is not None:
            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
            chunks = None
        # Cleanup bitmaps
        if bitmap_cleanup:
            for bitmap in bitmap_cleanup:
                self._mtmd_cpp.mtmd_bitmap_free(bitmap)
            bitmap_cleanup.clear()
        # NOTE(review): bitmap_array is not referenced anywhere else in this
        # method; this assignment looks like a leftover. Confirm before removing.
        bitmap_array = None

    # Handle response format and tools (same as before)
    if response_format is not None and response_format["type"] == "json_object":
        grammar = _grammar_for_response_format(response_format)

    # Convert legacy functions to tools
    # NOTE(review): this conversion runs after _process_mtmd_prompt has already
    # been called with the original `tools` / `tool_choice` values.
    if functions is not None:
        tools = [
            {
                "type": "function",
                "function": function,
            }
            for function in functions
        ]

    # Convert legacy function_call to tool_choice
    if function_call is not None:
        if isinstance(function_call, str) and (
            function_call == "none" or function_call == "auto"
        ):
            tool_choice = function_call
        if isinstance(function_call, dict) and "name" in function_call:
            tool_choice = {
                "type": "function",
                "function": {
                    "name": function_call["name"],
                },
            }

    # A dict-valued tool_choice names one specific tool; constrain generation
    # to that tool's parameter schema.
    tool = None
    if (
        tool_choice is not None
        and isinstance(tool_choice, dict)
        and tools is not None
    ):
        name = tool_choice["function"]["name"]
        tool = next((t for t in tools if t["function"]["name"] == name), None)
        if tool is None:
            raise ValueError(f"Tool choice '{name}' not found in tools.")
        schema = tool["function"]["parameters"]
        try:
            # create grammar from json schema
            grammar = llama_grammar.LlamaGrammar.from_json_schema(
                json.dumps(schema), verbose=llama.verbose
            )
        except Exception as e:
            # Any failure to build a schema grammar falls back to the generic JSON grammar.
            if llama.verbose:
                print(str(e), file=sys.stderr)
            grammar = llama_grammar.LlamaGrammar.from_string(
                llama_grammar.JSON_GBNF, verbose=llama.verbose
            )

    completion_or_chunks = llama.create_completion(
        prompt=prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        # top_logprobs is only forwarded when logprobs is truthy.
        logprobs=top_logprobs if logprobs else None,
        stream=stream,
        stop=stop,
        seed=seed,
        max_tokens=max_tokens,
        present_penalty=present_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        top_n_sigma=top_n_sigma,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        xtc_threshold=xtc_threshold,
        xtc_probability=xtc_probability,
        dry_multiplier=dry_multiplier,
        dry_base=dry_base,
        dry_allowed_length=dry_allowed_length,
        dry_penalty_last_n=dry_penalty_last_n,
        dry_seq_breakers=dry_seq_breakers,
        adaptive_target=adaptive_target,
        adaptive_decay=adaptive_decay,
        use_infill=use_infill,
        model=model,
        logits_processor=logits_processor,
        grammar=grammar,
        logit_bias=logit_bias,
        reasoning_budget=reasoning_budget,
        reasoning_start=reasoning_start,
        reasoning_end=reasoning_end,
        reasoning_budget_message=reasoning_budget_message,
        reasoning_start_in_prompt=reasoning_start_in_prompt,
        reasoning_start_max_tokens=reasoning_start_max_tokens,
    )

    if tool is not None:
        tool_name = tool["function"]["name"]
        return _convert_completion_to_chat_function(
            tool_name, completion_or_chunks, stream
        )
    return _convert_completion_to_chat(completion_or_chunks, stream=stream)
def load_media(self, media_url: str, media_type: str) -> bytes:
    """
    Fetch the raw payload for one media item, choosing the loader by type.

    Args:
        media_url: Location of the media, passed through to the loader.
        media_type: One of "image", "audio" or "video".

    Returns:
        The bytes produced by `_load_image` (images) or `_load_bytes`
        (audio with a 15 s timeout, video with a 30 s timeout).

    Raises:
        ValueError: `media_type` is not recognised, or the audio payload is
            rejected by `detect_audio_format`.
    """
    if media_type == "image":
        return self._load_image(media_url)

    if media_type == "video":
        return self._load_bytes(media_url, timeout=30, kind="video")

    if media_type != "audio":
        raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'")

    # Audio is the only payload validated up front: the detected format is
    # discarded, the call is made purely for its ValueError on bad data.
    payload = self._load_bytes(media_url, timeout=15, kind="audio")
    try:
        self.detect_audio_format(payload)
    except ValueError as e:
        raise ValueError(f"{self.log_prefix}(load_media): {e}")
    return payload
@staticmethod
def detect_audio_format(audio_bytes: bytes) -> str:
    """
    Identify an audio container from its leading magic bytes.

    The checks follow llama.cpp's `is_audio_file`: a RIFF header only counts
    as WAV when bytes 8..11 spell "WAVE", so other RIFF containers such as
    AVI are rejected.

    Args:
        audio_bytes: Raw audio payload.

    Returns:
        "wav", "mp3" or "flac". WAV is tested first, then MP3, then FLAC.

    Raises:
        ValueError: The payload is shorter than 12 bytes or matches none of
            the three supported signatures.
    """
    if len(audio_bytes) < 12:
        raise ValueError("Audio data is corrupted or too small (less than 12 bytes).")

    # RIFF container carrying a WAVE form type.
    if audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE":
        return "wav"

    # MP3: either an ID3 tag, or an MPEG frame sync (0xFF followed by a byte
    # whose top three bits are set).
    if audio_bytes.startswith(b"ID3"):
        return "mp3"
    if audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0:
        return "mp3"

    if audio_bytes.startswith(b"fLaC"):
        return "flac"

    raise ValueError(
        "Unsupported audio format detected via magic bytes. "
        "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
    )
Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) + + # 2. Handle local file path + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() + + # 3. Handle remote URL via HTTP/HTTPS + else: + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) + try: + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") + + if not media_bytes: + raise ValueError(f"Empty {kind} data received") + + return media_bytes + + @staticmethod + def _load_image(image_url: str) -> bytes: + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + + # 2. Check if image_bytes is empty. + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
@staticmethod
def _load_image(image_url: str) -> bytes:
    """
    Fetch an image and re-encode it as an RGB JPEG.

    The bytes are obtained through `MTMDChatHandler._load_bytes` (15 s
    timeout), decoded with Pillow, flattened if they carry transparency, and
    saved as JPEG.

    Transparency handling: for RGBA / LA / PA images, and for palette images
    that declare a "transparency" entry, the picture is composited onto a
    solid background. The background is black when the mean luminance of the
    non-transparent pixels exceeds 127, and white otherwise.

    Args:
        image_url: Any source accepted by `_load_bytes`.

    Returns:
        JPEG bytes (quality=95, optimized, progressive) in RGB mode.

    Raises:
        ValueError: No image data was received.
        ImportError: Pillow is not installed.
    """
    # 1. Fetch the encoded image.
    raw = MTMDChatHandler._load_bytes(image_url, timeout=15, kind="image")

    # 2. Refuse an empty payload.
    if not raw:
        raise ValueError("Empty image data received")

    # 3. Pillow is imported lazily so it is only required when images are used.
    try:
        from PIL import Image, ImageStat
    except ImportError:
        raise ImportError("Pillow is required for image processing. Install with: pip install pillow")

    import io
    image = Image.open(io.BytesIO(raw))

    # 4. Flatten transparency onto a contrasting solid background.
    mode = image.mode
    transparent_palette = mode == "P" and "transparency" in image.info
    if mode in ("RGBA", "LA", "PA") or transparent_palette:
        if mode == "P":
            # Palette transparency has no alpha band until converted.
            image = image.convert("RGBA")

        alpha = image.split()[-1]  # the alpha band is always the last one
        # Luminance statistics restricted to pixels the alpha mask lets through.
        stat = ImageStat.Stat(image.convert("L"), mask=alpha)

        # count is checked first so mean is never computed over zero pixels.
        content_is_bright = stat.count[0] > 0 and stat.mean[0] > 127
        bg_color = (0, 0, 0) if content_is_bright else (255, 255, 255)

        canvas = Image.new("RGB", image.size, bg_color)
        canvas.paste(image, mask=alpha)
        image = canvas

    # 5. Everything else that is not already RGB (CMYK, palette, L, ...).
    elif mode != "RGB":
        image = image.convert("RGB")

    # 6. Encode as high-quality JPEG.
    encoded = io.BytesIO()
    image.save(encoded, format="JPEG", quality=95, optimize=True, progressive=True)
    return encoded.getvalue()
@classmethod
def from_pretrained(
    cls,
    repo_id: str,
    filename: Optional[str],
    local_dir: Optional[Union[str, os.PathLike[str]]] = None,
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
    **kwargs: Any,
) -> "MTMDChatHandler":
    """
    Build a handler from a projector file hosted on the Hugging Face Hub.

    The files listed by `HfFileSystem().ls(repo_id)` are matched against
    `filename` with `fnmatch`; exactly one match is required. The matched
    file is downloaded with `hf_hub_download`, and its local path is passed
    to the constructor as `mmproj_path`.

    Args:
        repo_id: Hub repository id; validated with `validate_repo_id`.
        filename: Glob pattern selecting the file inside the repository.
            NOTE(review): annotated Optional, but None is passed straight to
            `fnmatch.fnmatch`; confirm whether None should be supported.
        local_dir: Optional download directory, forwarded to `hf_hub_download`.
        local_dir_use_symlinks: Forwarded to `hf_hub_download`.
        cache_dir: Optional cache directory, forwarded to `hf_hub_download`.
        **kwargs: Forwarded to the class constructor.

    Returns:
        A new instance of `cls`.

    Raises:
        ImportError: `huggingface_hub` is not installed.
        ValueError: Zero files, or more than one file, match `filename`.
    """
    import fnmatch
    from pathlib import Path

    try:
        from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
        from huggingface_hub.utils import validate_repo_id  # type: ignore
    except ImportError:
        # Name the class actually being used, not the Llama class.
        raise ImportError(
            f"{cls.__name__}.from_pretrained requires the huggingface_hub package. "
            "You can install it with `pip install --upgrade huggingface_hub`."
        )

    validate_repo_id(repo_id)

    hffs = HfFileSystem()

    files = [
        file["name"] if isinstance(file, dict) else file
        for file in hffs.ls(repo_id)  # type: ignore
    ]

    # Strip the repo prefix so patterns are matched against repo-relative paths.
    file_list: List[str] = [
        str(Path(file).relative_to(repo_id)) for file in files
    ]

    matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore

    if len(matching_files) == 0:
        raise ValueError(
            f"No file found in {repo_id} that match {filename}\n\n"
            f"Available Files:\n{json.dumps(file_list)}"
        )

    if len(matching_files) > 1:
        # List the same repo-relative names as the "no file" message above.
        raise ValueError(
            f"Multiple files found in {repo_id} matching {filename}\n\n"
            f"Available Files:\n{json.dumps(file_list)}"
        )

    (matching_file,) = matching_files

    subfolder = str(Path(matching_file).parent)
    filename = Path(matching_file).name

    # download the file
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=cast(Union[str, Path, None], local_dir),
        local_dir_use_symlinks=local_dir_use_symlinks,
        cache_dir=cast(Union[str, Path, None], cache_dir),
    )

    if local_dir is None:
        # Resolve the cached path without touching the network again.
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            subfolder=subfolder,
            local_dir=local_dir,
            local_dir_use_symlinks=local_dir_use_symlinks,
            cache_dir=cast(Union[str, Path, None], cache_dir),
            local_files_only=True,
        )
    else:
        # Join the full repo-relative path so a match inside a subfolder is
        # not looked up at the top of local_dir. For a top-level file this is
        # identical to joining the bare filename.
        model_path = os.path.join(local_dir, matching_file)

    return cls(
        mmproj_path=model_path,
        **kwargs,
    )
+ + It is designed for model templates that emit media placeholders such as + <|image_pad|>, <|image|>, , [IMG], or Kimi-style <|media_pad|>. + Model-specific handlers may still be preferable when a model requires + special stop tokens, generation flags, or non-standard template arguments. + """ + + KNOWN_MEDIA_TAGS = [ + # Pad placeholders inside model-specific wrappers. + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + + # Direct placeholders inside Gemma/Llama/GLM-style wrappers. + "<|image|>", + "<|audio|>", + "<|video|>", + + # LLaVA / LFM / Mistral-style placeholders. + "", + "