diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 0000000000..790d7c9665 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,249 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.0.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Inspect Visual Studio OpenMP runtime paths + run: | + Write-Output "ProgramFiles=$env:ProgramFiles" + Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}" + Write-Output "" + + $vsRoots = @( + "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC" + ) + + foreach ($root in $vsRoots) { + Write-Output "Checking root: $root" + + if (Test-Path $root) { + Write-Output " Exists: yes" + Write-Output " MSVC version directories:" + + Get-ChildItem $root -Directory -ErrorAction SilentlyContinue | + Sort-Object Name | + ForEach-Object { + Write-Output " $($_.FullName)" + } + + Write-Output " OpenMP runtime candidates:" + + Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue | + Sort-Object FullName | + ForEach-Object { + $sizeKB = [Math]::Round($_.Length / 1KB, 2) + $sizeMB = [Math]::Round($_.Length / 1MB, 4) + + Write-Output " Path: $($_.FullName)" + Write-Output " Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB" + } + } else { + Write-Output " Exists: no" + } + + Write-Output "" + } + + Write-Output "Checking System32 fallback:" + $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll" + + if (Test-Path $system32OpenMP) { + $dll = Get-Item $system32OpenMP + $sizeKB = [Math]::Round($dll.Length / 1KB, 2) + $sizeMB = [Math]::Round($dll.Length / 1MB, 4) + + Write-Output " Path: $($dll.FullName)" + Write-Output " Size: $($dll.Length) bytes / $sizeKB KB / $sizeMB MB" + } else { + Write-Output " Not found: $system32OpenMP" + } + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. + + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + # CPU all-variants is now an internal runtime layout detail. + $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a9f359d1cd..ec81b294c4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -40,11 +40,11 @@ jobs: # macOS Metal - os: macos-26 python-version: "3.9" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on" metal_status: "(Metal)" - os: macos-26 python-version: "3.14" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on" metal_status: "(Metal)" steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f09cdb783..5b2cfeeb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,8 @@ function(llama_cpp_python_install_target target) endfunction() -# Install an extra Windows runtime DLL into the Python package runtime directory. +# Copy an extra Windows runtime DLL into the Python package runtime directory +# during the CMake install step. # # Some dynamically loaded backend libraries depend on runtime DLLs that are not # always discoverable through $. One important example @@ -75,7 +76,10 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) endif() if(NOT EXISTS "${runtime_file}") - message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + message(WARNING + "Windows runtime DLL was selected but does not exist and will not be copied: " + "${runtime_file}" + ) return() endif() @@ -92,6 +96,11 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) foreach(DIR ${INSTALL_DIRS}) file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + message(STATUS + "Will copy Windows runtime DLL during install: " + "${runtime_file_cmake} -> ${DIR_CMAKE}" + ) + install( FILES "${runtime_file_cmake}" DESTINATION "${DIR_CMAKE}" @@ -115,42 +124,73 @@ function(llama_cpp_python_install_windows_openmp_runtime) endif() set(OPENMP_RUNTIME_DLL "") + set(OPENMP_RUNTIME_SOURCE "") + set(FOUND_OPENMP_DLLS "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL) + if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL") + else() + message(WARNING + "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " + "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " + "VC143 LLVM OpenMP runtime discovery." + ) + endif() + endif() - if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - else() + if(NOT OPENMP_RUNTIME_DLL) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_ROOTS - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - ) + set(VS_OPENMP_VC143_PATTERNS + # Prefer VS 2022 VC143 LLVM OpenMP redist paths. + # The MSVC version directory is intentionally globbed because + # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) - if(EXISTS "${ROOT}") - file( - GLOB_RECURSE FOUND_OPENMP_DLLS - "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" - "${ROOT}/**/libomp140.x86_64.dll" - ) + # Secondary VS layout fallbacks for unusual installations. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + ) - if(FOUND_OPENMP_DLLS) - list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - break() - endif() - endif() + foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS}) + file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") + list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() + + if(FOUND_OPENMP_DLLS) + list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) + list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist") + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll") + + if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "System32 fallback") + endif() endif() if(OPENMP_RUNTIME_DLL) - message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + message(STATUS + "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: " + "${OPENMP_RUNTIME_DLL}" + ) llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") else() message(WARNING - "Could not find libomp140.x86_64.dll. " + "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 " + "Enterprise/BuildTools VC143 redist paths under Program Files and " + "Program Files (x86), with a fuzzy MSVC version match such as " + "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. " "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 434921e6bd..91befb2247 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -152,12 +152,17 @@ def model_size(self) -> int: """ return llama_cpp.llama_model_size(self.model) - def model_chat_template(self, name: bytes) -> str: + def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]: """ - Get the default chat template. Returns nullptr if not available - If name is NULL, returns the default chat template + Get a chat template from the model. + + If name is None, returns the default chat template. + Returns None if no chat template is available. """ - return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + template = llama_cpp.llama_model_chat_template(self.model, name) + if template is None: + return None + return template.decode("utf-8") def n_params(self) -> int: """ diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d89f4c361d..b6a2c8d5a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -45,6 +45,7 @@ from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp.llama_multimodal as llama_multimodal from llama_cpp.llama_speculative import LlamaDraftModel @@ -96,6 +97,7 @@ class Llama: def __init__( self, model_path: str, + mmproj_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -172,6 +174,8 @@ def __init__( log_filters: Optional[Sequence[str]] = None, log_filters_case_sensitive: bool = True, # Extra Params + chat_template_name: Optional[str] = None, + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. @@ -709,6 +713,18 @@ def __init__( self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) + + if mmproj_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_multimodal.GenericMTMDChatHandler( + chat_format = self.metadata.get("tokenizer.chat_template", None), + mmproj_path = mmproj_path, + verbose = self.verbose, + chat_template_name=chat_template_name, + **chat_handler_kwargs + ) if self.verbose: print(f"Model desc: {self.model_desc}, " diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index fb42a59f23..6ffe68e5e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,7 +1,5 @@ from __future__ import annotations -import base64 -import ctypes import dataclasses import datetime import json @@ -9,9 +7,7 @@ import random import string import sys -import zlib -from contextlib import ExitStack from typing import ( Any, Dict, @@ -32,16 +28,11 @@ import numpy as np import numpy.typing as npt -import urllib.request -from urllib.error import URLError, HTTPError - -import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar -from ._ggml import GGMLLogLevel -from ._logger import logger, ggml_log_callback +from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -3037,3489 +3028,208 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class MTMDChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " -"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful." - ) - - CHAT_FORMAT = ( - "{{ bos_token if bos_token is defined else '' }}" +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% elif message.role == 'user' %}" - "USER: " - "{% if message.content is string %}" - "{{ message.content }}" - "{% elif message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% elif content.type == 'audio_url' %}" - "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" - "{% elif content.type == 'input_audio' %}" - "{% if content.input_audio is string %}" - "{{ content.input_audio }}" - "{% else %}" - "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" - "{% endif %}" - "{% elif content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - - "{% elif message.role == 'assistant' and message.content is not none %}" - "ASSISTANT: {{ message.content }}" - "{% endif %}" - "{{ \"\n\" }}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." + "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" "{% endfor %}" - - "{% if eos_token is defined %}" - "{{ eos_token }}" + "<|im_end|>\n" "{% endif %}" - - "{% if add_generation_prompt %}" - "ASSISTANT: " "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) - def __init__( - self, - clip_model_path: str, - verbose: bool = True, - use_gpu: bool = True, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - - self.log_prefix = self.__class__.__name__ - if kwargs: - unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) - raise TypeError( - f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" - f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." - ) - - self.clip_model_path = clip_model_path - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - self.use_gpu = use_gpu - self.verbose = verbose + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] - import llama_cpp.mtmd_cpp as mtmd_cpp - self._mtmd_cpp = mtmd_cpp - self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None - self.extra_template_arguments: dict[str, Any] = {} + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } - if not os.path.exists(clip_model_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + ) - # Pre-compile Jinja template - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) - - self._exit_stack = ExitStack() - - def _init_mtmd_context(self, llama_model: llama_core.Llama): - """Initialize mtmd context with the llama model.""" - if self.mtmd_ctx is not None: - return # Already initialized - - self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), - llama_model.model, - self.mctx_params + # Case 1: No tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + add_generation_prompt=True, ) - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") - - # Check if vision is supported - self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) - - def close(self) -> None: - """Explicitly free the mtmd context and vision model resources.""" - if getattr(self, "mtmd_ctx", None) is not None: - try: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - except Exception: - pass - self.mtmd_ctx = None - self.mctx_params = None - self.chat_template = None - - if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): - self._exit_stack.close() - self._exit_stack = None - - def __del__(self) -> None: - self.close() - - def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: - """ - Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. - Strictly enforces capability checks, raising exceptions if unsupported media is passed. - - Returns: - media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). - """ - media_items: List[Dict[str, str]] = [] - for message in messages: - if isinstance(message.get("content"), list): - for content in message["content"]: - content_type = content.get("type", "") - - # 1. Vision Processing - if content_type == "image_url": - if not self.is_support_vision: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") - - url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] - media_items.append({"url": url, "type": "image"}) - - # 2. Audio Processing - elif content_type in ["audio_url", "input_audio"]: - if not self.is_support_audio: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") - - # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url": - audio_url = content["audio_url"] - url = audio_url if isinstance(audio_url, str) else audio_url["url"] - media_items.append({"url": url, "type": "audio"}) - # Case B: Handle OpenAI standard input_audio format - elif content_type == "input_audio": - input_audio = content.get("input_audio", {}) - if isinstance(input_audio, dict) and "data" in input_audio: - # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic - # input_audio: { - # data: audio.base64Data, - # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' - # } - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - - # Strictly align with llama.cpp (require wav/mp3) - if audio_format not in ["wav", "mp3"]: - raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") - - # Format as a Data URI to reuse the unified load_media logic - media_items.append({ - "url": f"data:audio/{audio_format};base64,{audio_data}", - "type": "audio" - }) - else: - # Just a raw base64 data - url = input_audio if isinstance(input_audio, str) else "" - if url: - media_items.append({"url": url, "type": "audio"}) - - # 3. Text & Unknown Types - elif content_type == "text": - continue - else: - if self.verbose: - print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) - return media_items - - def _create_bitmap_from_bytes(self, media_bytes: bytes): - """ - Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. - - Supported formats: - - Images (via stb_image): jpg, png, bmp, etc. - - Audio (via miniaudio): wav, mp3, flac. - - Note: - - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. - - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. - - Args: - media_bytes (bytes): The raw byte content of the media file. + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) - Returns: - mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. - """ - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), - len(media_bytes), - False, + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, ) - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") - - return bitmap - - - def _process_mtmd_prompt( - self, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - add_generation_prompt: bool = True, - ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - """ - Core multimodal preprocessing pipeline. - Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. - - Features: - - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. - - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. - - Strict RAII-style C++ memory management to prevent leaks on failure. - - Returns: - full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. - chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). - chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). - bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. - """ - # 1. Inject default system prompt if omitted by the user - system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - - media_items = self._get_media_items(messages) - media_marker = self.media_marker - - # 2. Render the chat template and replace actual URLs with C++ media markers - text = self.chat_template.render( + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None + ) + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( messages=messages, - add_generation_prompt=add_generation_prompt, - eos_token=self.mtmd_eos_token, - bos_token=self.mtmd_bos_token, - functions=functions, - function_call=function_call, tools=tools, - tool_choice=tool_choice, - **getattr(self, 'extra_template_arguments', {}) + tool_calls=True, + add_generation_prompt=True, ) - # Replace image_url by media_marker in text - for item in media_items: - text = text.replace(item["url"], media_marker) - - if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) - - # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(media_items) - bitmap_cleanup = [] - chunks = None - + prompt += f"functions.{tool_name}:\n" try: - # Concurrent Media Decoding - import concurrent.futures - if media_items: - def _create_bitmap_func(idx: int, item: str): - media_bytes = self.load_media(item["url"], item["type"]) - bitmap = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap - # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, - # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(media_items)) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] - - for future in concurrent.futures.as_completed(futures): - idx, bitmap = future.result() - bitmaps[idx] = bitmap - bitmap_cleanup.append(bitmap) - - # Strict validation: Abort if any thread failed to decode its assigned media - if any(b is None for b in bitmaps): - raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") - else: - if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") - else: - # If there are no images, set the bitmaps to empty. - bitmaps = [] - - # 4. Initialize mtmd_input_chunks - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True - - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") - - # 5. Hybrid Tokenization (Text + Media binding) - if len(bitmaps) > 0: - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) - ) - else: - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") - - # 6. Virtual Token Ledger Construction - full_prompt_ids = [] - chunk_token_spans = [] - current_idx = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - - # Cursor to track the actual media contents (URLs or base64 data) provided by the user - media_items_count = len(media_items) - media_items_cur = 0 - - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: - # Extract standard text token IDs - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) - if tokens_ptr and n_tokens_out.value > 0: - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) - full_prompt_ids.extend(tokens) - current_idx += len(tokens) - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: - # Extract media properties - # Note(JamePeng): - # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). - # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. - # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - - if media_items_cur < media_items_count: - # The C++ parser only sees identical placeholders (e.g., "<__media__>"). - # We MUST inject the actual media content's identity here. - real_media_url = media_items[media_items_cur]["url"] - # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Generate a deterministic, unique negative ID for this specific image/audio. - # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). - # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with - # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). - # This empowers `longest_token_prefix` to correctly identify and reuse cached images, - # while instantly breaking the match if the image content changes. - # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 - media_items_cur += 1 - else: - # Magic Negative Number as fallback :) - media_id = -314159 - - if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") - - chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) - - # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache - full_prompt_ids.extend([media_id] * chunk_n_tokens) - current_idx += chunk_n_tokens - else: - raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") - - return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup - - except Exception as e: - # Ensure no useless pointers remain upon any failure - # Free chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Free bitmaps - if len(bitmap_cleanup) > 0: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup = None - bitmaps = None - - raise e - - def __call__( - self, - *, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - add_generation_prompt: bool = True, - reasoning_budget: int = -1, - reasoning_start: str = "", - reasoning_end: str = "", - reasoning_budget_message: Optional[str] = None, - reasoning_start_in_prompt: bool = False, - reasoning_start_max_tokens: Optional[int] = 32, - **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - # 1. Initialize mtmd context - self._init_mtmd_context(llama) - assert self.mtmd_ctx is not None - - # 2. Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( - llama=llama, - messages=messages, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - add_generation_prompt=add_generation_prompt, - ) - - if self.verbose: - print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - - try: - # 3. KV Cache Synchronization & State Rollback - # Compares the virtual ledger with physical history to prevent Cache Poisoning. - current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) - - if longest_prefix < llama.n_tokens: - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if llama._hybrid_cache_mgr.max_checkpoints > 0: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos - if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) - llama._ctx.memory_seq_rm(0, longest_prefix, -1) - llama.n_tokens = longest_prefix - - n_past = llama.n_tokens - - for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: - # Skip previously matched chunks - if end_idx <= n_past: - continue - - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: - unprocessed_start = max(start_idx, n_past) - start_idx - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - - if tokens_ptr and n_tokens_out.value > 0: - all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - tokens_to_eval = all_tokens[unprocessed_start:] - - if tokens_to_eval: - if self.verbose: - print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - # Text evaluation delegates shift and chunking to native llama.eval - llama.eval(tokens_to_eval) - n_past = llama.n_tokens - - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) - - if self.verbose: - media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" - print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - - # Stage 5: Multimodal Physical OOM Defense - if n_past + chunk_n_tokens > llama.n_ctx(): - if not llama._ctx.memory_can_shift(): - raise RuntimeError( - f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " - f"(n_pos_per_embd > 1 or incompatible M-RoPE). " - f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " - f"You MUST increase n_ctx to fit the dialogue." - ) - else: - # Safely discard oldest tokens while preserving system prompts - n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch - n_keep = min(llama.n_keep, n_past) - n_discard = min(n_discard, n_past - n_keep) - - if n_discard <= 0: - raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - - if self.verbose: - print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - - # Execute physical memory shift - llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) - llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - - # Shift python virtual array to match - remaining_len = n_past - (n_keep + n_discard) - if remaining_len > 0: - llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - - n_past -= n_discard - llama.n_tokens = n_past - - # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp_lib.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk_ptr, - llama_cpp_lib.llama_pos(n_past), - llama_cpp_lib.llama_seq_id(0), - llama.n_batch, - True, # logits_last = True, drastically saves computational overhead - ctypes.byref(new_n_past) - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") - - # Update Ledger with "Negative Reverse Vocabulary" IDs - llama.input_ids[n_past : new_n_past.value] = media_id - n_past = new_n_past.value - llama.n_tokens = n_past - - # Extract the final, perfectly synchronized prompt sequence - prompt = llama.input_ids[: llama.n_tokens].tolist() - - # End-of-Turn Checkpoint - # Anchors the state ONLY after the entire multi-modal turn is processed - if ( - llama.is_hybrid - and llama._hybrid_cache_mgr is not None - and llama._hybrid_cache_mgr.max_checkpoints > 0 - ): - if self.verbose: - print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) - - llama._hybrid_cache_mgr.save_checkpoint( - current_pos=llama.n_tokens, - tokens=prompt, - seq_id=0 - ) - finally: - # Cleanup chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Cleanup bitmaps - if bitmap_cleanup: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup.clear() - bitmap_array = None - - # Handle response format and tools (same as before) - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - tool = None - if ( - tool_choice is not None - and isinstance(tool_choice, dict) - and tools is not None - ): - name = tool_choice["function"]["name"] - tool = next((t for t in tools if t["function"]["name"] == name), None) - if tool is None: - raise ValueError(f"Tool choice '{name}' not found in tools.") - schema = tool["function"]["parameters"] - try: - # create grammar from json schema - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose - ) - except Exception as e: - if llama.verbose: - print(str(e), file=sys.stderr) - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - logprobs=top_logprobs if logprobs else None, - stream=stream, - stop=stop, - seed=seed, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logit_bias=logit_bias, - reasoning_budget=reasoning_budget, - reasoning_start=reasoning_start, - reasoning_end=reasoning_end, - reasoning_budget_message=reasoning_budget_message, - reasoning_start_in_prompt=reasoning_start_in_prompt, - reasoning_start_max_tokens=reasoning_start_max_tokens, - ) - - if tool is not None: - tool_name = tool["function"]["name"] - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream - ) - return _convert_completion_to_chat(completion_or_chunks, stream=stream) - - def load_media(self, media_url: str, media_type: str) -> bytes: - """ - Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image or audio processor based on the media_type. - """ - if media_type == "image": - return self._load_image(media_url) - elif media_type == "audio": - audio_bytes = self._load_audio(media_url) - # Apply ironclad magic bytes validation before returning - try: - self.detect_audio_format(audio_bytes) - except ValueError as e: - raise ValueError(f"{self.log_prefix}(load_media): {e}") - return audio_bytes - else: - raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") - - @staticmethod - def detect_audio_format(audio_bytes: bytes) -> str: - """ - Pure utility function: Detects the audio format from magic bytes. - Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility - and avoid false positives (e.g., AVI files disguised as RIFF). - """ - length = len(audio_bytes) - - if length < 12: - raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") - - # RIFF & WAVE magic bytes verification - is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" - - # ID3 metadata or MPEG sync word verification - is_mp3 = length >= 3 and ( - audio_bytes.startswith(b"ID3") or - (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) - ) - - # FLAC magic bytes verification - is_flac = audio_bytes.startswith(b"fLaC") - - if is_wav: - return "wav" - elif is_mp3: - return "mp3" - elif is_flac: - return "flac" - else: - raise ValueError( - "Unsupported audio format detected via magic bytes. " - "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." - ) - - @staticmethod - def _load_audio(audio_url: str) -> bytes: - """ - Load audio from either a URL, local path, or a data URI and return raw bytes. - """ - - audio_bytes = b"" - - # 1. Handle data URI (base64) - if audio_url.strip().startswith("data:"): - comma_pos = audio_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = audio_url[comma_pos + 1 :] - audio_bytes = base64.b64decode(base64_data) - - # 2. Handle local file path - elif os.path.exists(audio_url): - with open(audio_url, "rb") as f: - audio_bytes = f.read() - - # 3. Handle remote URL via HTTP/HTTPS - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(audio_url, headers=headers) - try: - with urllib.request.urlopen(req, timeout=15) as f: - audio_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") - - if not audio_bytes: - raise ValueError("Empty audio data received") - - return audio_bytes - - @staticmethod - def _load_image(image_url: str) -> bytes: - """ - Load an image from either a URL or a data URI and return it as JPEG bytes. - - Supports: - - Remote images via HTTP/HTTPS (with proper User-Agent) - - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - - Returns: - JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. - """ - image_bytes = b"" - - # 1. Handle data URI (base64) - if image_url.strip().startswith("data:"): - # Split only once from the right to correctly handle mime types containing commas - comma_pos = image_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = image_url[comma_pos + 1 :] - image_bytes = base64.b64decode(base64_data) - - # 2. Handle local/remote URL - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(image_url, headers=headers) - - try: - with urllib.request.urlopen(req, timeout=15) as f: - image_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download image from {image_url}: {e}") - - if not image_bytes: - raise ValueError("Empty image data received") - - # 3. Open image with Pillow - try: - from PIL import Image, ImageStat - except ImportError: - raise ImportError("Pillow is required for image processing. Install with: pip install pillow") - - import io - image = Image.open(io.BytesIO(image_bytes)) - - # 4. Handle transparency (RGBA, LA, P with transparency, etc.) - if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): - # Use alpha channel as mask - if image.mode == "P": - image = image.convert("RGBA") - - alpha = image.split()[-1] # Last channel is alpha - # Compute average brightness of visible (non-transparent) pixels - stat = ImageStat.Stat(image.convert("L"), mask=alpha) - - # Choose background: white for dark content, black for bright content - bg_color = (255, 255, 255) # white - if stat.count[0] > 0 and stat.mean[0] > 127: - bg_color = (0, 0, 0) # black - - background = Image.new("RGB", image.size, bg_color) - background.paste(image, mask=alpha) - image = background - - # 5. Ensure RGB mode for formats like CMYK, palette, etc. - elif image.mode != "RGB": - image = image.convert("RGB") - - # 6. Save as high-quality JPEG, suitable for most vision models. - output = io.BytesIO() - image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) - return output.getvalue() - - @classmethod - def from_pretrained( - cls, - repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, - **kwargs: Any, - ) -> "MTMDChatHandler": - import fnmatch - from pathlib import Path - - try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore - from huggingface_hub.utils import validate_repo_id # type: ignore - except ImportError: - raise ImportError( - "Llama.from_pretrained requires the huggingface_hub package. " - "You can install it with `pip install --upgrade huggingface_hub`." - ) - - validate_repo_id(repo_id) - - hffs = HfFileSystem() - - files = [ - file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id) # type: ignore - ] - - # split each file into repo_id, subfolder, filename - file_list: List[str] = [] - for file in files: - rel_path = Path(file).relative_to(repo_id) - file_list.append(str(rel_path)) - - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore - - if len(matching_files) == 0: - raise ValueError( - f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" - ) - - if len(matching_files) > 1: - raise ValueError( - f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" - ) - - (matching_file,) = matching_files - - subfolder = str(Path(matching_file).parent) - filename = Path(matching_file).name - - # download the file - hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=cast(Union[str, Path, None], local_dir), - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - ) - - if local_dir is None: - model_path = hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=local_dir, - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - local_files_only=True, - ) - else: - model_path = os.path.join(local_dir, filename) - - return cls( - clip_model_path=model_path, - **kwargs, - ) - - -class Llava15ChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "\nASSISTANT: " - "{% endif %}" - ) - - -class ObsidianChatHandler(MTMDChatHandler): - # Prompt Format - # The model followed ChatML format. However, with ### as the seperator - - # <|im_start|>user - # What is this sign about?\n - # ### - # <|im_start|>assistant - # The sign is about bullying, and it is placed on a black background with a red background. - # ### - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}\n" - "###\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "###\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "###\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MoondreamChatHandler(MTMDChatHandler): - # Chat Format: - # f"\n\n{chat_history}Question: {question}\n\nAnswer:" - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "Question: {{ content.text }}\n\n" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "Question: {{ message.content }}\n\n" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "Answer:{{ message.content }}\n\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class Llava16ChatHandler(MTMDChatHandler): - # Example prompt - # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "{{ message.content }}" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class NanoLlavaChatHandler(MTMDChatHandler): - # Prompt Format - # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: - - # <|im_start|>system - # Answer the question<|im_end|><|im_start|>user - # - # What is the picture about?<|im_end|><|im_start|>assistant - DEFAULT_SYSTEM_MESSAGE = "Answer the question" - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "<|im_end|>" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class Llama3VisionAlphaChatHandler(MTMDChatHandler): - # question = "" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "<|start_header_id|>" - "{% if message.role == 'user' %}" - "user<|end_header_id|>\n\n" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "assistant<|end_header_id|>\n\n" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "{% endif %}" - ) - - -# alias -Llama3VisionAlpha = Llama3VisionAlphaChatHandler - - -class MiniCPMv26ChatHandler(MTMDChatHandler): - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and messages[0]['role'] != 'system' %}" - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is iterable %}" - "{% for content in message['content'] %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - - "{% for content in message['content'] %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% endif %}" - "<|im_end|>\n" - "{% endfor %}" - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MiniCPMv45ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V 4.5 models. - - Supports: - - Multi-step tool calls with and XML tags. - - Integrated reasoning (thinking) process with tags. - - Specialized system prompt handling with tool definitions. - - Global image numbering for multi-image processing. - """ - - # Model specific control tokens - MINICPMV_BOS_TOKEN = "<|im_start|>" - MINICPMV_EOS_TOKEN = "<|im_end|>" - MINICPMV_PAD_TOKEN = "<|endoftext|>" - - # Image placeholder tags - MINICPMV_IMAGE_START_TOKEN = "" - MINICPMV_IMAGE_END_TOKEN = "" - MINICPMV_IMAGE_ID_START_TOKEN = "" - MINICPMV_IMAGE_ID_END_TOKEN = "" - - CHAT_FORMAT = ( - # --- 1. First System Message & Tools Definitions --- - "{%- if tools %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" - "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" - "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" - "{{- 'You are provided with function signatures within XML tags:\\n' }}" - "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" - "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- elif messages[0].role == 'system' %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - - # --- 2. Message Stream Processing --- - "{% set image_count = namespace(value=0) %}" - "{%- for message in messages %}" - # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- - "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" - "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" - - "{%- set content = message.content %}" - "{%- if content is not string %}" - "{%- set ns = namespace(content_str='') %}" - "{%- for item in content %}" - # --- Explicit image_url type and value checking --- - "{%- if item.type == 'image_url' %}" - "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" - "{%- set image_count.value = image_count.value + 1 %}" - # Format: N: IMAGE_URL - "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" - "{%- elif item.type == 'text' %}" - "{%- set ns.content_str = ns.content_str + item.text %}" - "{%- endif %}" - "{%- endfor %}" - "{%- set content = ns.content_str %}" - "{%- endif %}" - - "{{- content -}}" - - # Append tool_calls to assistant messages if they exist - "{%- if message.role == 'assistant' and message.tool_calls %}" - "{%- for tool_call in message.tool_calls %}" - "{%- set tc = tool_call.function if tool_call.function else tool_call %}" - "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" - "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" - "{{- '}\\n' }}" - "{%- endfor %}" - "{%- endif %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - - # --- Specialized Tool Response Handling --- - # Group consecutive tool responses under a single user-like block - "{%- elif message.role == 'tool' %}" - "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" - "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" - "{%- endif %}" - "{{- '\\n\\n' + message.content + '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - - # --- 3. Generation Prompt --- - "{%- if add_generation_prompt %}" - "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" - # Handle thinking/reasoning block visibility based on configuration - "{%- if enable_thinking is defined and enable_thinking is false %}" - "{{- '\\n\\n\\n\\n' }}" - "{%- elif enable_thinking is defined and enable_thinking is true %}" - "{{- '\\n' }}" - "{%- endif %}" - "{%- endif %}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V 4.5 Handler. - - Args: - enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base MTMDChatHandler. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject thinking control flag into the template - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set stop token patch - kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class MiniCPMV46ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V-4.6 models. - - Features: - - Aligned with official tokenizer_config.json special tokens. - - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. - - Integrated MTMD-style URL and Base64 injection for visual content. - - Specialized `` and `` block generation. - - Autonomously folds previous reasoning paths using `last_query_index`. - - Toggles `` block generation via `enable_thinking` (Defaults to False). - """ - - # Core tokens - MINICPM_BOS_TOKEN = "<|im_start|>" - MINICPM_EOS_TOKEN = "<|im_end|>" - MINICPM_PAD_TOKEN = "<|endoftext|>" - - # Vision tokens - MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" - MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" - MINICPM_IMAGE_TOKEN = "<|image_pad|>" - MINICPM_VIDEO_TOKEN = "<|video_pad|>" - - CHAT_FORMAT = ( - "{%- if enable_thinking is not defined -%}\n" - " {%- set enable_thinking = false -%}\n" - "{%- endif -%}\n" - "{%- macro render_content(content, is_system_content=false) -%}\n" - " {%- if content is string -%}\n" - " {{- content -}}\n" - " {%- elif content is iterable and content is not mapping -%}\n" - " {%- set ns = namespace(parts=[]) -%}\n" - " {%- for item in content -%}\n" - " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" - " {%- if is_system_content -%}\n" - " {{- raise_exception('System message cannot contain images.') -}}\n" - " {%- endif -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.type == 'image_url' -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" - # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" - # " {%- if is_system_content -%}\n" - # " {{- raise_exception('System message cannot contain videos.') -}}\n" - # " {%- endif -%}\n" - # " {%- set url_val = '' -%}\n" - # " {%- if item.type == 'video_url' -%}\n" - # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" - # " {%- endif -%}\n" - # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" - " {%- elif 'text' in item -%}\n" - " {%- set ns.parts = ns.parts + [item.text] -%}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected item type in content.') -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.parts | join('\\n') -}}\n" - " {%- elif content is none or content is undefined -%}\n" - " {{- '' -}}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected content type.') -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- if not messages %}\n" - " {{- raise_exception('No messages provided.') }}\n" - "{%- endif %}\n" - "{%- if tools and tools is iterable and tools is not mapping %}\n" - " {{- '<|im_start|>system\\n' }}\n" - " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" - " {%- for tool in tools %}\n" - " {{- '\\n' }}\n" - " {{- tool | tojson }}\n" - " {%- endfor %}\n" - " {{- '\\n' }}\n" - " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {%- if content %}\n" - " {{- '\\n\\n' + content }}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - "{%- else %}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" - "{%- for message in messages[::-1] %}\n" - " {%- set index = (messages|length - 1) - loop.index0 %}\n" - " {%- if ns.multi_step_tool and message.role == 'user' %}\n" - " {%- set content = render_content(message.content)|trim %}\n" - " {%- if not(content.startswith('') and content.endswith('')) %}\n" - " {%- set ns.multi_step_tool = false %}\n" - " {%- set ns.last_query_index = index %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if ns.multi_step_tool %}\n" - " {{- raise_exception('No user query found in messages.') }}\n" - "{%- endif %}\n" - "{%- for message in messages %}\n" - " {%- set content = render_content(message.content)|trim %}\n" - " {%- if message.role == 'system' %}\n" - " {%- if not loop.first %}\n" - " {{- raise_exception('System message must be at the beginning.') }}\n" - " {%- endif %}\n" - " {%- elif message.role == 'user' %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" - " {%- elif message.role == 'assistant' %}\n" - " {%- set reasoning_content = '' %}\n" - " {%- if message.reasoning_content is string %}\n" - " {%- set reasoning_content = message.reasoning_content %}\n" - " {%- else %}\n" - " {%- if '' in content %}\n" - " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {%- set reasoning_content = reasoning_content|trim %}\n" - " {%- if loop.index0 > ns.last_query_index %}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" - " {%- else %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" - " {%- endif %}\n" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" - " {%- for tool_call in message.tool_calls %}\n" - " {%- if tool_call.function is defined %}\n" - " {%- set tool_call = tool_call.function %}\n" - " {%- endif %}\n" - " {%- if loop.first %}\n" - " {%- if content|trim %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " {{- '\\n\\n\\n' }}\n" - " {%- endif %}\n" - " {%- if tool_call.arguments is defined %}\n" - " {%- for args_name, args_value in tool_call.arguments|items %}\n" - " {{- '\\n' }}\n" - " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" - " {{- args_value }}\n" - " {{- '\\n\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif message.role == 'tool' %}\n" - " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" - " {{- '<|im_start|>user' }}\n" - " {%- endif %}\n" - " {{- '\\n\\n' }}\n" - " {{- content }}\n" - " {{- '\\n' }}\n" - " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif loop.last %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " {{- raise_exception('Unexpected message role.') }}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if add_generation_prompt %}\n" - " {{- '<|im_start|>assistant\\n' }}\n" - " {%- if enable_thinking is defined and enable_thinking is false %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V-4.6 Handler. - - Args: - enable_thinking (bool): Controls whether to open a `` block for reasoning. - Defaults to False as per the standard template logic. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # MiniCPM uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class Gemma3ChatHandler(MTMDChatHandler): - - GEMMA3_BOI_TOKEN = "" - GEMMA3_EOI_TOKEN = "" - GEMMA3_BOS_TOKEN = "" - GEMMA3_EOS_TOKEN = "" - - CHAT_FORMAT = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" - "{% if messages[0]['content'] is string %}" - "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" - "{% else %}" - "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" - "{% endif %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set first_user_prefix = '' %}" - "{% endif %}" - - "{% for message in loop_messages %}" - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" - "{% endif %}" - - "{% if message['role'] == 'assistant' %}" - "{% set role = 'model' %}" - "{% else %}" - "{% set role = message['role'] %}" - "{% endif %}" - - "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" - - "{% if message['content'] is string %}" - "{{ message['content'] | trim }}" - "{% elif message['content'] is iterable %}" - "{% for item in message['content'] %}" - "{% if item['type'] == 'image_url' and item['image_url'] is string %}" - "{{ '' + item['image_url'] + '' }}" - "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" - "{{ '' + item['image_url']['url'] + '' }}" - "{% elif item['type'] == 'text' %}" - "{{ item['text'] | trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "model\n" - "{% endif %}" - ) - - -class Gemma4ChatHandler(MTMDChatHandler): - """ - Handler for Gemma 4 models. - - Note on `enable_thinking`: - The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. - It is NOT supported by Gemma4 E2B and E4B models. - - [Important Note for Audio Processing!] - It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. - Other quantizations are known to have degraded performance; - ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 - """ - - # The special token in Gemma 4 - GEMMA4_BOI_TOKEN = "<|image>" - GEMMA4_EOI_TOKEN = "" - GEMMA4_BOA_TOKEN = "<|audio>" - GEMMA4_EOA_TOKEN = "" - GEMMA4_BOS_TOKEN = "" - GEMMA4_EOS_TOKEN = "" - GEMMA4_SOT_TOKEN = "<|turn>" - GEMMA4_EOT_TOKEN = "" - GEMMA4_SOC_TOKEN = "<|channel>" - GEMMA4_EOC_TOKEN = "" - GEMMA4_STC_TOKEN = "<|tool_call>" - GEMMA4_ETC_TOKEN = "" - GEMMA4_STD_TOKEN = "<|tool>" - GEMMA4_ETD_TOKEN = "" - GEMMA4_STR_TOKEN = "<|tool_response>" - GEMMA4_ETR_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" - " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in properties | dictsort -%}\n" - " {%- set add_comma = false -%}\n" - " {%- if not filter_keys or key not in standard_keys -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {{ key }}:{\n" - " {%- if value['description'] -%}\n" - " description:<|\"|>{{ value['description'] }}<|\"|>\n" - " {%- set add_comma = true -%}\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'STRING' -%}\n" - " {%- if value['enum'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " enum:{{ format_argument(value['enum']) }}\n" - " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'ARRAY' -%}\n" - " {%- if value['items'] is mapping and value['items'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " items:{\n" - " {%- set ns_items = namespace(found_first=false) -%}\n" - " {%- for item_key, item_value in value['items'] | dictsort -%}\n" - " {%- if item_value is not none -%}\n" - " {%- if ns_items.found_first %},{% endif -%}\n" - " {%- set ns_items.found_first = true -%}\n" - " {%- if item_key == 'properties' -%}\n" - " properties:{\n" - " {%- if item_value is mapping -%}\n" - " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- elif item_key == 'required' -%}\n" - " required:[\n" - " {%- for req_item in item_value -%}\n" - " <|\"|>{{- req_item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- elif item_key == 'type' -%}\n" - " {%- if item_value is string -%}\n" - " type:{{ format_argument(item_value | upper) }}\n" - " {%- else -%}\n" - " type:{{ format_argument(item_value | map('upper') | list) }}\n" - " {%- endif -%}\n" - " {%- else -%}\n" - " {{ item_key }}:{{ format_argument(item_value) }}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " }\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'OBJECT' -%}\n" - " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " }\n" - " {%- elif value is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" - " }\n" - " {%- endif -%}\n" - " {%- if value['required'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - "{%- endmacro -%}\n" - "{%- macro format_function_declaration(tool_data) -%}\n" - " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" - " {%- set params = tool_data['function']['parameters'] -%}\n" - " {%- if params -%}\n" - " ,parameters:{\n" - " {%- if params.get('properties') -%}\n" - " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" - " {%- endif -%}\n" - " {%- if params.get('required') -%}\n" - " required:[\n" - " {%- for item in params['required'] -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {{- ',' if not loop.last -}}\n" - " {%- endfor -%}\n" - " ],\n" - " {%- endif -%}\n" - " {%- if params.get('type') -%}\n" - " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if 'response' in tool_data['function'] -%}\n" - " {%- set response_declaration = tool_data['function']['response'] -%}\n" - " ,response:{\n" - " {%- if response_declaration['description'] -%}\n" - " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" - " {%- endif -%}\n" - " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" - " type:<|\"|>{{- response_declaration['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " }\n" - "{%- endmacro -%}\n" - "{%- macro format_argument(argument, escape_keys=True) -%}\n" - " {%- if argument is string -%}\n" - " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" - " {%- elif argument is boolean -%}\n" - " {{- 'true' if argument else 'false' -}}\n" - " {%- elif argument is mapping -%}\n" - " {{- '{' -}}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in argument | dictsort -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {%- if escape_keys -%}\n" - " {{- '<|\"|>' + key + '<|\"|>' -}}\n" - " {%- else -%}\n" - " {{- key -}}\n" - " {%- endif -%}\n" - " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- elif argument is sequence -%}\n" - " {{- '[' -}}\n" - " {%- for item in argument -%}\n" - " {{- format_argument(item, escape_keys=escape_keys) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- ']' -}}\n" - " {%- else -%}\n" - " {{- argument -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- macro strip_thinking(text) -%}\n" - " {%- set ns = namespace(result='') -%}\n" - " {%- for part in text.split('') -%}\n" - " {%- if '<|channel>' in part -%}\n" - " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" - " {%- else -%}\n" - " {%- set ns.result = ns.result + part -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.result | trim -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- macro format_tool_response_block(tool_name, response) -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if response is mapping -%}\n" - " {{- 'response:' + tool_name + '{' -}}\n" - " {%- for key, value in response | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- else -%}\n" - " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" - " {%- endif -%}\n" - " {{- '' -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- set ns = namespace(prev_message_type=None) -%}\n" - "{%- set loop_messages = messages -%}\n" - "{{- bos_token -}}\n" - "{#- Handle System/Tool Definitions Block -#}\n" - "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- '<|turn>system\\n' -}}\n" - " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" - " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>\\n' -}}\n" - " {%- set ns.prev_message_type = 'think' -%}\n" - " {%- endif -%}\n" - " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {%- if messages[0]['content'] is string -%}\n" - " {{- messages[0]['content'] | trim -}}\n" - " {%- elif messages[0]['content'] is sequence -%}\n" - " {%- for item in messages[0]['content'] -%}\n" - " {{- item['text'] | trim + ' '-}}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set loop_messages = messages[1:] -%}\n" - " {%- endif -%}\n" - " {%- if tools -%}\n" - " {%- for tool in tools %}\n" - " {{- '<|tool>' -}}\n" - " {{- format_function_declaration(tool) | trim -}}\n" - " {{- '' -}}\n" - " {%- endfor %}\n" - " {%- set ns.prev_message_type = 'tool' -%}\n" - " {%- endif -%}\n" - " {{- '\\n' -}}\n" - "{%- endif %}\n" - "\n" - "{#- Pre-scan: find last user message index for reasoning guard -#}\n" - "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" - "{%- for i in range(loop_messages | length) -%}\n" - " {%- if loop_messages[i]['role'] == 'user' -%}\n" - " {%- set ns_turn.last_user_idx = i -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{#- Loop through messages -#}\n" - "{%- for message in loop_messages -%}\n" - " {%- if message['role'] != 'tool' -%}\n" - " {%- set ns.prev_message_type = None -%}\n" - " {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n" - " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" - " {%- set prev_nt = namespace(role=None, found=false) -%}\n" - " {%- if loop.index0 > 0 -%}\n" - " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" - " {%- if not prev_nt.found -%}\n" - " {%- if loop_messages[j]['role'] != 'tool' -%}\n" - " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" - " {%- set prev_nt.found = true -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" - " {%- if not continue_same_model_turn -%}\n" - " {{- '<|turn>' + role + '\\n' }}\n" - " {%- endif -%}\n" - "\n" - " {#- Render reasoning/reasoning_content as thinking channel -#}\n" - " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" - " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" - " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" - " {%- endif -%}\n" - "\n" - " {%- if message.get('tool_calls') -%}\n" - " {%- for tool_call in message['tool_calls'] -%}\n" - " {%- set function = tool_call['function'] -%}\n" - " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" - " {%- if function['arguments'] is mapping -%}\n" - " {%- set ns_args = namespace(found_first=false) -%}\n" - " {%- for key, value in function['arguments'] | dictsort -%}\n" - " {%- if ns_args.found_first %},{% endif -%}\n" - " {%- set ns_args.found_first = true -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- endfor -%}\n" - " {%- elif function['arguments'] is string -%}\n" - " {{- function['arguments'] -}}\n" - " {%- endif -%}\n" - " {{- '}' -}}\n" - " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_call' -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set ns_tr_out = namespace(flag=false) -%}\n" - " {%- if message.get('tool_responses') -%}\n" - " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" - " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endfor -%}\n" - " {%- elif message.get('tool_calls') -%}\n" - " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" - " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" - " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" - " {%- if ns_tool_scan.stopped -%}\n" - " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" - " {%- set ns_tool_scan.stopped = true -%}\n" - " {%- else -%}\n" - " {%- set follow = loop_messages[k] -%}\n" - " {#- Resolve tool_call_id to function name -#}\n" - " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" - " {%- for tc in message['tool_calls'] -%}\n" - " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" - " {%- set ns_tname.name = tc['function']['name'] -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {#- Handle content as string or content-parts array -#}\n" - " {%- set tool_body = follow.get('content') -%}\n" - " {%- if tool_body is string -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- elif tool_body is sequence and tool_body is not string -%}\n" - " {%- set ns_txt = namespace(s='') -%}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'text' -%}\n" - " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'image_url' -%}\n" - " {%- set url_val = part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" - " {%- if part.get('type') == 'audio_url' -%}\n" - " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif part.get('type') == 'input_audio' -%}\n" - " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - # " {%- elif part.get('type') == 'video_url' -%}\n" - # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- else -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set captured_content -%}\n" - " {%- if message['content'] is string -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(message['content']) -}}\n" - " {%- else -%}\n" - " {{- message['content'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif message['content'] is sequence -%}\n" - " {%- for item in message['content'] -%}\n" - " {%- if item['type'] == 'text' -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(item['text']) -}}\n" - " {%- else -%}\n" - " {{- item['text'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif item['type'] == 'image_url' -%}\n" - " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- set ns.prev_message_type = 'image' -%}\n" - " {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" - " {%- if item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- endif -%}\n" - # " {%- elif item['type'] == 'video_url' -%}\n" - # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endset -%}\n" - "\n" - " {{- captured_content -}}\n" - " {%- set has_content = captured_content | trim | length > 0 -%}\n" - "\n" - " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" - " {{- '<|turn>model\\n' -}}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- '<|channel>thought\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Gemma 4 Handler. - - Args: - enable_thinking (bool): Controls whether the <|think|> tag is injected and - manages <|channel>thought behavior. - Note: ONLY supported on Gemma4 31B and 26BA4B models. - NOT supported on Gemma4 E2B and E4B models. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [1, 106, 50] - kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GLM41VChatHandler(MTMDChatHandler): - # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. - - GLM41V_EOS_TOKEN = "<|endoftext|>" - GLM41V_PAD_TOKEN = "<|endoftext|>" - GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]\n" - "{%- for msg in messages -%}" - "{%- if msg.role == 'system' -%}" - "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'user' -%}" - "<|user|>\n" - "{%- if msg.content is string -%}" - "{{ msg.content }}" - "{%- else -%}" - "{%- for item in msg.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'assistant' -%}" - "{%- if msg.metadata -%}" - "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- else -%}" - "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{%- endif -%}" - ) - - def __call__(self, **kwargs): - self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json - stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch - kwargs['stop'] = stop_tokens - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class GLM46VChatHandler(MTMDChatHandler): - GLM46V_EOS_TOKEN = "<|endoftext|>" - GLM46V_PAD_TOKEN = "<|endoftext|>" - GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]" - "{%- if tools -%}" - "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" - "You are provided with function signatures within XML tags:\n\n" - "{%- for tool in tools -%}" - "{{ tool | tojson(ensure_ascii=False) }}\n" - "{%- endfor -%}" - "\n\nFor each function call, output the function name and arguments within the following XML format:\n" - "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" - "{%- endif -%}" - - "{%- for m in messages -%}" - "{%- if m.role == 'system' -%}" - "<|system|>\n{{ m.content }}" - "{%- elif m.role == 'user' -%}" - "<|user|>\n" - "{%- if m.content is string -%}" - "{{ m.content }}" - "{%- else -%}" - "{%- for item in m.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - # If enable_thinking is disabled, insert `/nothink` according to the source code logic. - "{{ '/nothink' if not enable_thinking else '' }}" - "{%- elif m.role == 'assistant' -%}" - "<|assistant|>" - "{%- if enable_thinking -%}" - "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" - "\n{{ reasoning.strip() }}" - "{%- else -%}" - "\n" - "{%- endif -%}" - "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" - "{%- endif -%}" - "{{ GLM46V_EOS_TOKEN }}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{{ '' if enable_thinking else '\n' }}" - "{%- endif -%}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - GLM-4.6V Handler - Parameters: - - enable_thinking (bool): Whether to enable the model's think process. The default is True. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN - - # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json - kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GraniteDoclingChatHandler(MTMDChatHandler): - """ - Handler for Granite-Docling models. - - Format(512x512): Content - - Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! - Since the model does not have special tokens for the start and end of an image, - it is recommended to process only one image at a time. - You can iterate through the images individually for recognition. - - """ - GRANITE_BOS_TOKEN = "<|start_of_role|>" - GRANITE_EOS_TOKEN = "<|end_of_text|>" - GRANITE_PAD_TOKEN = "<|end_of_text|>" - GRANITE_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for part in message['content'] -%}" - "{%- if part['type'] == 'text' -%}" - "{{- part['text'] -}}" - "{%- elif part['type'] == 'image_url' -%}" - "{%- if part.image_url is string -%}" - "{{- part.image_url -}}" - "{%- else -%}" - "{{- part.image_url.url -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '<|end_of_text|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|start_of_role|>assistant' -}}" - # Support the 'controls' parameter if present in generation arguments - "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" - "{{- '<|end_of_role|>' -}}" - "{%- endif -%}" - ) - - def __init__(self, controls: dict = None, **kwargs): - """ - Granite-Docling Handler - Args: - controls (dict, optional): Operational parameters passed to the assistant role. - - The 'controls' parameter is used to guide the model's behavior or output format. - Common examples for 'controls' include: - - Document Parsing: {"mode": "document_parsing", "format": "json"} - """ - self.controls = controls - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject controls into the template environment - self.extra_template_arguments["controls"] = self.controls - self.DEFAULT_SYSTEM_MESSAGE = None - kwargs['stop'] = [self.GRANITE_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - - return super().__call__(**kwargs) - - -class LFM2VLChatHandler(MTMDChatHandler): - LFM2VL_BOS_TOKEN = "<|startoftext|>" - LFM2VL_EOS_TOKEN = "<|im_end|>" - LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{ '<|im_start|>' + message['role'] + '\n' }}" - "{%- if message['content'] is string -%}" - "{{ message['content'] }}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if 'image_url' in content -%}" - "{%- if content.image_url is string -%}" - "<|image_start|>{{ content.image_url }}<|image_end|>" - "{%- else -%}" - "<|image_start|>{{ content.image_url.url }}<|image_end|>" - "{%- endif -%}" - "{%- elif content['type'] == 'text' -%}" - "{{ content['text'] }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{ '<|im_end|>\n' }}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{ '<|im_start|>assistant\n' }}" - "{%- endif -%}" - ) - - def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): - """ - LFM2-VL Handler - LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 - """ - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) - - def __call__(self, **kwargs): - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class LFM25VLChatHandler(MTMDChatHandler): - """ - Handler for LFM2.5-VL multimodal models. - - Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. - """ - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '' in content -%}\n" - " {%- set content = content.split('')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. Please reset it to between 64 and 256.") - self.image_min_tokens = -1 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class PaddleOCRChatHandler(MTMDChatHandler): - """ - Handler for PaddleOCR 1.5/1.6 multimodal models. - """ - - PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" - PADDLEOCR_BOS_TOKEN = "" - PADDLEOCR_EOS_TOKEN = "" - PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" - PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" - PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" - - CHAT_FORMAT = ( - "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" - "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" - "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" - - "{{- cls_token -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'user' -%}" - "{{- 'User: ' -}}" - - # Robust parsing: Check if content is string or list - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - # Pass 1: Render all images first - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" - "{{- '<|IMAGE_START|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|IMAGE_END|>' -}}" - "{%- endif -%}" - "{%- endfor -%}" - - # Pass 2: Render all text second - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '\\n' -}}" - - "{%- elif message['role'] == 'assistant' -%}" - "{{- 'Assistant:\\n' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- eos_token -}}" - - "{%- elif message['role'] == 'system' -%}" - "{%- if message['content'] is string -%}" - "{{- message['content'] + '\\n' -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] + '\\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "{{- 'Assistant:\\n' -}}" - "{%- endif -%}" - ) - - def __init__( - self, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__( - image_min_tokens=self.image_min_tokens, - image_max_tokens=self.image_max_tokens, - **kwargs - ) - - def __call__(self, **kwargs): - # Set the specific stop token defined in the PaddleOCR template - kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class Qwen25VLChatHandler(MTMDChatHandler): - - QWEN25_VL_BOS_TOKEN = "<|endoftext|>" - QWEN25_VL_PAD_TOKEN = "<|endoftext|>" - QWEN25_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" - "<|im_start|>assistant\n" - ) - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen3ASRChatHandler(MTMDChatHandler): - """ - Handler for Qwen 3 ASR (Automatic Speech Recognition) models. - - Features: - - Highly specialized for Speech-to-Text tasks. - - Aggregates all system text into a single cohesive system block. - - Drops user text entirely, extracting ONLY audio data into a unified user turn. - - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. - - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. - """ - - DEFAULT_SYSTEM_MESSAGE = """ - You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. - You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. - """ - - QWEN3_ASR_BOS_TOKEN = "<|im_start|>" - QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" - QWEN3_ASR_EOS_TOKEN = "<|im_end|>" - - - QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" - QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" - QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" - - CHAT_FORMAT = ( - "{%- set ns = namespace(system_text='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.role == 'system' -%}\n" - " {%- if m.content is string -%}\n" - " {%- set ns.system_text = ns.system_text + m.content -%}\n" - " {%- else -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'text' and (c.text is defined) -%}\n" - " {%- set ns.system_text = ns.system_text + c.text -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- set ns2 = namespace(audio_tokens='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.content is not string -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" - " {#- MTMD Audio Injection -#}\n" - " {%- set audio_val = '' -%}\n" - " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" - " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" - " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" - " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" - " {%- endif -%}\n" - " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" - "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token - kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") - - return super().__call__(**kwargs) - -class Qwen3VLChatHandler(MTMDChatHandler): - - QWEN3_VL_BOS_TOKEN = "<|endoftext|>" - QWEN3_VL_PAD_TOKEN = "<|endoftext|>" - QWEN3_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{{- '<|im_start|>system\n' -}}" - "{%- if messages[0].content is string and messages[0].role == 'system' -%}" - "{{- messages[0].content -}}" - "{%- elif messages[0].role == 'system' -%}" - "{%- if 'text' in messages[0].content -%}" - "{{- messages[0].content.text -}}" - "{%- else -%}" - "{{- 'You are a helpful assistant.' -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{{- '\n\n' -}}" - "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" - "{%- for tool in tools -%}" - "{{- '\n' -}}" - "{{- tool | tojson -}}" - "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" - "{%- endif -%}" - "{{- '<|im_end|>\n' -}}" - "{%- set image_count = namespace(value=0) -%}" - #"{%- set video_count = namespace(value=0) -%}" - "{%- for message in messages -%}" - "{%- if message.role == 'tool' -%}" - "{{- '<|im_start|>user\n\n' -}}" - "{%- elif message.role != 'system' -%}" - "{{- '<|im_start|>' + message.role + '\n' -}}" - "{%- endif -%}" - "{%- if message.content is string and message.role != 'system' -%}" - "{{- message.content -}}" - "{%- elif message.role != 'system' -%}" - "{%- for content in message.content -%}" - "{%- if 'image_url' in content -%}" - "{%- set image_count.value = image_count.value + 1 -%}" - "{%- if add_vision_id -%}" - "{{- 'Picture ' -}}" - "{{- image_count.value | string -}}" - "{{- ': ' -}}" - "{%- endif -%}" - "{{- '<|vision_start|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|vision_end|>' -}}" - "{%- endif -%}" - # Video not supported yet - "{%- if 'text' in content -%}" - "{{- content.text -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message.role == 'assistant' -%}" - "{%- if message.tool_calls -%}" - "{%- for tool_call in message.tool_calls -%}" - "{%- if (loop.first and message.content) or (not loop.first) -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- if tool_call.function -%}" - "{%- set tool_call = tool_call.function -%}" - "{%- endif -%}" - "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" - "{%- if tool_call.arguments is string -%}" - "{{- tool_call.arguments -}}" - "{%- else -%}" - "{{- tool_call.arguments | tojson -}}" - "{%- endif -%}" - "{{- '}\n' -}}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- elif message.role == 'tool' -%}" - "{{- '' -}}" - "{%- endif -%}" - "{%- if message.role != 'system' -%}" - "{{- '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if force_reasoning -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - force_reasoning: bool = False, - add_vision_id: bool = True, - **kwargs, - ): - """ - Parameters: - - force_reasoning (bool): - - True: Force the reasoning in the model by adding to the chat template. - - False (default): Don't force the reasoning. - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - """ - super().__init__(**kwargs) - self.force_reasoning = force_reasoning - self.extra_template_arguments["force_reasoning"] = force_reasoning - self.extra_template_arguments["add_vision_id"] = add_vision_id - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen35ChatHandler(MTMDChatHandler): - """ - Handler for Qwen3.5/Qwen3.6 models. - """ - CHAT_FORMAT = ( - "{%- set image_count = namespace(value=0) -%}" - "{%- set video_count = namespace(value=0) -%}" - "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" - " {%- if content is string -%}" - " {{- content -}}" - " {%- elif content is iterable and content is not mapping -%}" - " {%- for item in content -%}" - " {%- if 'image_url' in item or item.type == 'image_url' -%}" - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain images.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set image_count.value = image_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Picture ' -}}" - " {{- image_count.value | string -}}" - " {{- ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'video' in item -%}" - " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain videos.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set video_count.value = video_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Video ' ~ video_count.value ~ ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {{- item.video -}}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'text' in item -%}" - " {{- item.text -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected item type in content.') -}}" - " {%- endif -%}" - " {%- endfor -%}" - " {%- elif content is none or content is undefined -%}" - " {{- '' -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected content type.') -}}" - " {%- endif -%}" - "{%- endmacro -%}" - "{%- if not messages -%}" - " {{- raise_exception('No messages provided.') -}}" - "{%- endif -%}" - "{%- if tools and tools is iterable and tools is not mapping -%}" - " {{- '<|im_start|>system\n' -}}" - " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" - " {%- for tool in tools -%}" - " {{- '\n' -}}" - " {{- tool | tojson -}}" - " {%- endfor -%}" - " {{- '\n' -}}" - " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" - " {%- if messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) | trim -%}" - " {%- if content -%}" - " {{- '\n\n' + content -}}" - " {%- endif -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - "{%- elif messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) -%}" - " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" - "{%- for message in messages[::-1] -%}" - " {%- set index = messages | length - 1 - loop.index0 -%}" - " {%- if ns.multi_step_tool and message.role == 'user' -%}" - " {%- set content = render_content(message.content, false) | trim -%}" - " {%- if not (content.startswith('') and content.endswith('')) -%}" - " {%- set ns.multi_step_tool = false -%}" - " {%- set ns.last_query_index = index -%}" - " {%- endif -%}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if ns.multi_step_tool -%}" - " {{- raise_exception('No user query found in messages.') -}}" - "{%- endif -%}" - "{%- for message in messages -%}" - " {%- set content = render_content(message.content, true) | trim -%}" - " {%- if message.role == 'system' -%}" - " {%- if not loop.first -%}" - " {{- raise_exception('System message must be at the beginning.') -}}" - " {%- endif -%}" - " {%- elif message.role == 'user' -%}" - " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" - " {%- elif message.role == 'assistant' -%}" - " {%- set reasoning_content = '' -%}" - " {%- if message.reasoning_content is string -%}" - " {%- set reasoning_content = message.reasoning_content -%}" - " {%- elif '' in content -%}" - " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" - " {%- set content = content.split('')[-1].lstrip('\n') -%}" - " {%- endif -%}" - " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" - " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" - " {%- else -%}" - " {{- '<|im_start|>' + message.role + '\n' + content -}}" - " {%- endif -%}" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_calls -%}" - " {%- if tool_call.function is defined -%}" - " {%- set tool_call = tool_call.function -%}" - " {%- endif -%}" - " {%- if loop.first -%}" - " {%- if content | trim -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- '\n\n\n' -}}" - " {%- endif -%}" - " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_call.arguments | items -%}" - " {{- '\n' -}}" - " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" - " {{- args_value -}}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif message.role == 'tool' -%}" - " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" - " {{- '<|im_start|>user' -}}" - " {%- endif -%}" - " {{- '\n\n' -}}" - " {{- content -}}" - " {{- '\n' -}}" - " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif loop.last -%}" - " {{- '<|im_end|>\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- raise_exception('Unexpected message role.') -}}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is defined and enable_thinking is false -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n' -}}" - " {%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - add_vision_id: bool = True, - enable_thinking: bool = True, - preserve_thinking: bool = False, - **kwargs, - ): - """ - Parameters: - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - - preserve_thinking (bool): - - True: Keeps reasoning process for ALL historical conversational turns. - - False (default): Only keeps for the latest assistant reply to save tokens. - """ - super().__init__(**kwargs) - self.enable_thinking = enable_thinking - self.preserve_thinking = preserve_thinking - self.extra_template_arguments["add_vision_id"] = add_vision_id - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["preserve_thinking"] = preserve_thinking - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Step3VLChatHandler(MTMDChatHandler): - """ - Handler for Step3-VL models. - """ - - STEP3VL_BOS_TOKEN = "<|im_start|>" - STEP3VL_EOS_TOKEN = "<|im_end|>" - STEP3VL_PAD_TOKEN = "<|endoftext|>" - STEP3VL_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro render_content(content) -%}\n" - " {%- if content is none -%}{{- '' -}}\n" - " {%- elif content is string -%}{{- content -}}\n" - " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" - " {%- elif content is iterable -%}\n" - " {%- for item in content -%}\n" - " {%- if item.type == 'text' -%}\n" - " {{- item['value'] if 'value' in item else item['text'] -}}\n" - " {%- elif item.type in ['image', 'image_url'] -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.image_url -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {{- '' + url_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "\n" - "{%- if tools -%}\n" - " {{- '<|im_start|>system\\n' -}}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" - " {%- endif -%}\n" - " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n' -}}\n" - " {%- for tool in tools -%}\n" - " {{- '\\n' -}}\n" - " {{- tool | tojson -}}\n" - " {%- endfor -%}\n" - " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" - "{%- else -%}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - "\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" - "{%- for message in messages[::-1] -%}\n" - " {%- set index = (messages|length - 1) - loop.index0 -%}\n" - " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" - " {%- set ns.multi_step_tool = false -%}\n" - " {%- set ns.last_query_index = index -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- for message in messages -%}\n" - " {%- set content = render_content(message.content) -%}\n" - " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" - " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" - " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" - " {%- elif message.role == 'assistant' -%}\n" - " {%- if message.reasoning_content is string -%}\n" - " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" - " {%- else -%}\n" - " {%- if '' in content -%}\n" - " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" - " {%- else -%}\n" - " {%- set reasoning_content = '' -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if loop.index0 > ns.last_query_index -%}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" - " {%- else -%}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" - " {%- endif -%}\n" - " {%- if message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- for tool_call in message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- if tool_call.function -%}\n" - " {%- set tool_call = tool_call.function -%}\n" - " {%- endif -%}\n" - " {{- '\\n{\"name\": \"' -}}\n" - " {{- tool_call.name -}}\n" - " {{- '\", \"arguments\": ' -}}\n" - " {%- if tool_call.arguments is string -%}\n" - " {{- tool_call.arguments -}}\n" - " {%- else -%}\n" - " {{- tool_call.arguments | tojson -}}\n" - " {%- endif -%}\n" - " {{- '}\\n' -}}\n" - " {%- endfor -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- elif message.role == 'tool' -%}\n" - " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" - " {{- '<|im_start|>tool_response' -}}\n" - " {%- endif -%}\n" - " {{- '\\n\\n' -}}\n" - " {{- content -}}\n" - " {{- '\\n' -}}\n" - " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Step3-VL Handler. - - Args: - enable_thinking (bool): If False, injects an empty block to bypass reasoning. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Pass thinking toggle into Jinja - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Step3 uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -@register_chat_completion_handler("chatml-function-calling") -def chatml_function_calling( - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: - function_calling_template = ( - "{% for message in messages %}" - "<|im_start|>{{ message.role }}\n" - # System message - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% if tool_calls %}" - "\n\nYou have access to the following functions:\n" - "{% for tool in tools %}" - "\nfunctions.{{ tool.function.name }}:\n" - "{{ tool.function.parameters | tojson }}" - "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" - "\n\nmessage:" - "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "{% endif %}" - "<|im_end|>\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - ## Reglar message - "{% if message.content and message.content | length > 0 %}" - "{% if tool_calls %}" - "message:\n" - "{% endif %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - ## Function calls - "{% if 'tool_calls' in message %}" - "{% for tool_call in message.tool_calls %}" - "functions.{{ tool_call.function.name }}:\n" - "{{ tool_call.function.arguments }}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - ) - template_renderer = ImmutableSandboxedEnvironment( - autoescape=jinja2.select_autoescape(["html", "xml"]), - undefined=jinja2.StrictUndefined, - ).from_string(function_calling_template) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - stop = ( - [stop, "<|im_end|>"] - if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] - ) - - # Case 1: No tool choice by user - if ( - tool_choice is None - or (isinstance(tool_choice, str) and tool_choice == "none") - or tools is None - or len(tools) == 0 - ): - prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, - ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logprobs=top_logprobs if logprobs else None, - ), - stream=stream, - ) - - # Case 2: Tool choice by user - if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None - ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") - prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, - ) - prompt += f"functions.{tool_name}:\n" - try: - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(tool["function"]["parameters"]), verbose=llama.verbose - ) + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) except Exception as e: grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.JSON_GBNF, verbose=llama.verbose @@ -6829,3 +3539,35 @@ def chatml_function_calling( } raise ValueError("Automatic streaming tool choice is not supported") + +# Backward compatibility re-exports. +# These multimodal chat handlers have been moved to `llama_multimodal`. +# New code should import them from `llama_cpp.llama_multimodal` instead of +# `llama_cpp.llama_chat_format`. +from llama_cpp.llama_multimodal import ( + MTMDChatHandler, + GenericMTMDChatHandler, + Llava15ChatHandler, + ObsidianChatHandler, + MoondreamChatHandler, + Llava16ChatHandler, + NanoLlavaChatHandler, + Llama3VisionAlphaChatHandler, + Llama3VisionAlpha, + MiniCPMv26ChatHandler, + MiniCPMv45ChatHandler, + MiniCPMV46ChatHandler, + Gemma3ChatHandler, + Gemma4ChatHandler, + GLM41VChatHandler, + GLM46VChatHandler, + GraniteDoclingChatHandler, + LFM2VLChatHandler, + LFM25VLChatHandler, + PaddleOCRChatHandler, + Qwen25VLChatHandler, + Qwen3ASRChatHandler, + Qwen3VLChatHandler, + Qwen35ChatHandler, + Step3VLChatHandler +) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 3c431fc3d8..67ad424490 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -465,18 +465,18 @@ def __init__(self, content: str, deps: list = None): SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - 'boolean' : BuiltinRule('("true" | "false") space', []), + 'boolean' : BuiltinRule('("true" | "false")', []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), - 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), - 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part)', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), - 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), - 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []), 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), - 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), - 'null' : BuiltinRule('"null" space', []), + 'string' : BuiltinRule(r'"\"" char* "\""', ['char']), + 'null' : BuiltinRule('"null"', []), } # TODO: support "uri", "email" string formats @@ -484,9 +484,9 @@ def __init__(self, content: str, deps: list = None): 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), - 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), - 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), - 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), + 'date-string' : BuiltinRule('"\\"" date "\\""', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\""', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']), } DOTALL = '[\\U00000000-\\U0010FFFF]' @@ -585,7 +585,7 @@ def visit(node): out.append(f'[^"{"".join(rejects)}] {char_rule}*') visit(trie) - out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + out.append(f' ){"" if trie.is_end_of_string else "?"} ["]') return ''.join(out) def _add_rule(self, name, rule): @@ -815,7 +815,7 @@ def join_seq(): return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"") def _resolve_ref(self, ref): @@ -846,10 +846,10 @@ def visit(self, schema, name): return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') + return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) elif 'enum' in schema: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -890,7 +890,7 @@ def add_component(comp_schema, is_required): enum_intersection &= s if enum_intersection: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')' return self._add_rule(rule_name, rule) return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) @@ -904,12 +904,12 @@ def add_component(comp_schema, is_required): ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') for i, item in enumerate(items)) + - ' "]" space') + ' space "]"') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"') elif schema_type in (None, 'string') and 'pattern' in schema: return self._visit_pattern(schema['pattern'], rule_name) @@ -929,7 +929,7 @@ def add_component(comp_schema, is_required): min_len = schema.get('minLength', 0) max_len = schema.get('maxLength') - return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""') elif schema_type in (None, 'integer') and \ ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): @@ -946,7 +946,7 @@ def add_component(comp_schema, is_required): out = ["("] _generate_min_max_int(min_value, max_value, out) - out.append(") space") + out.append(")") return self._add_rule(rule_name, ''.join(out)) elif (schema_type == 'object') or (len(schema) == 0): @@ -1031,7 +1031,7 @@ def get_recursive_refs(ks, first_is_optional): rule += ' )' rule += ' )?' - rule += ' "}" space' + rule += ' space "}"' return rule diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py new file mode 100644 index 0000000000..f1b320b772 --- /dev/null +++ b/llama_cpp/llama_multimodal.py @@ -0,0 +1,3686 @@ +from __future__ import annotations + +import base64 +import ctypes +import json +import os +import sys +import zlib + +from contextlib import ExitStack +from typing import ( + Any, + Dict, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, + Protocol, + TYPE_CHECKING, + cast, +) + +import urllib.request +from urllib.error import URLError, HTTPError + +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar + +if TYPE_CHECKING: + import llama_cpp.llama as llama_core + +from ._logger import ggml_log_callback + +from llama_cpp.llama_chat_format import ( + _convert_completion_to_chat, + _convert_completion_to_chat_function, + _grammar_for_response_format, + ImmutableSandboxedEnvironment +) + +class MTMDChatHandler: + DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful." + ) + + CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% elif message.role == 'user' %}" + "USER: " + "{% if message.content is string %}" + "{{ message.content }}" + "{% elif message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" + "{% elif content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" + "{% endif %}" + "{{ \"\n\" }}" + "{% endfor %}" + + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + + "{% if add_generation_prompt %}" + "ASSISTANT: " + "{% endif %}" + ) + + KNOWN_MEDIA_TAGS: List[str] = [] + + def __init__( + self, + mmproj_path: Optional[str] = None, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, + batch_max_tokens: int = 1024, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + self.verbose = verbose + + # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`. + # Accept it for existing user code, warn during initialization, and normalize + # all internal usage to `mmproj_path`. + clip_model_path = kwargs.pop("clip_model_path", None) + if mmproj_path is None and clip_model_path is not None: + mmproj_path = clip_model_path + if self.verbose: + print( + f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; " + "please use `mmproj_path` instead.", + file=sys.stderr, + ) + + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." + ) + + if mmproj_path is None: + raise ValueError( + f"{self.log_prefix}(__init__): `mmproj_path` is required. " + "`clip_model_path` is accepted only as a deprecated compatibility alias." + ) + + self.mmproj_path = mmproj_path + if not os.path.exists(self.mmproj_path): + raise ValueError( + f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}" + ) + + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens + self.use_gpu = use_gpu + + import llama_cpp.mtmd_cpp as mtmd_cpp + self._mtmd_cpp = mtmd_cpp + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} + + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + + # Pre-compile Jinja template + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: + self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override + + self._chat_format_parser_tags = [] + self._change_chat_template(self.chat_format) + + self._exit_stack = ExitStack() + + def _change_chat_template(self, new_template: str): + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) + + def _init_mtmd_context(self, llama_model: llama_core.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.mmproj_path.encode(), + llama_model.model, + self.mctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") + + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) + + def close(self) -> None: + """Explicitly free the mtmd context and vision model resources.""" + if getattr(self, "mtmd_ctx", None) is not None: + try: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + except Exception: + pass + self.mtmd_ctx = None + self.mctx_params = None + self.chat_template = None + + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None + + def __del__(self) -> None: + self.close() + + def _get_media_url( + self, + content: Dict[str, Any], + keys: Tuple[str, ...], + media_type: str, + ) -> str: + """ + Extract a media URL or data URI from a multimodal content item. + + Different chat templates and client APIs may represent the same media + payload with slightly different keys. For example, an image may appear as + `image`, `image_url`, or a typed chunk with `{"type": "image", ...}`. + This helper checks the provided keys in order and returns the first usable + media payload. + + Returns an empty string when none of the requested keys exist or when the + payload shape is unsupported. The caller is responsible for raising a + media-type-specific error when an empty value is not acceptable. + """ + # Try keys in priority order. This lets callers prefer canonical fields + # such as "image" over compatibility aliases such as "image_url", while + # still accepting either representation. + value = None + for key in keys: + if key in content: + value = content[key] + break + + # String payloads may already be URLs, local paths, or data URIs. + if isinstance(value, str): + return value + + if isinstance(value, dict): + # Common OpenAI-style shape: + # {"image_url": {"url": "..."}} + if "url" in value: + return value["url"] + + # Forward-compatible inline media shape: + # {"audio": {"data": "...", "format": "wav"}} + # + # Convert it to a data URI so downstream media loading does not need + # separate branches for raw base64 payloads. + if "data" in value and "format" in value: + media_format = value.get("format", "") + media_data = value.get("data", "") + if media_format and media_data: + return f"data:{media_type}/{media_format};base64,{media_data}" + + return "" + + def _get_media_items( + self, + messages: List[llama_types.ChatCompletionRequestMessage], + ) -> List[Dict[str, str]]: + """ + Extract media payloads from chat messages in message/content order. + + Supports OpenAI-style typed media chunks as well as template-friendly + variants used by multimodal chat templates, such as: + - {"type": "image_url", "image_url": {"url": "..."}} + - {"type": "image", "image": "..."} + - {"image": "..."} + - {"type": "audio_url", "audio_url": {"url": "..."}} + - {"type": "audio", "audio": "..."} + - {"type": "input_audio", "input_audio": {"data": "...", "format": "wav"}} + - {"type": "video_url", "video_url": {"url": "..."}} + - {"type": "video", "video": "..."} + - {"video": "..."} + + The returned order must match the media placeholders emitted by the rendered + chat template as closely as possible. + """ + media_items: List[Dict[str, str]] = [] + + for message in messages: + content_list = message.get("content") + if not isinstance(content_list, list): + continue + + for content in content_list: + if not isinstance(content, dict): + continue + + content_type = content.get("type", "") + + has_image = ( + content_type in ("image", "image_url") + or "image" in content + or "image_url" in content + ) + has_audio = ( + content_type in ("audio", "audio_url", "input_audio") + or "audio" in content + or "audio_url" in content + or "input_audio" in content + ) + has_video = ( + content_type in ("video", "video_url") + or "video" in content + or "video_url" in content + ) + + media_kind_count = int(has_image) + int(has_audio) + int(has_video) + if media_kind_count > 1: + raise ValueError( + f"{self.log_prefix}: content item contains multiple media types; " + "each content item must contain only one of image, audio, or video." + ) + + # 1. Vision Processing + if has_image: + if not self.is_support_vision: + raise ValueError( + f"{self.log_prefix}: This mmproj model instance does not support image inputs." + ) + + url = self._get_media_url( + content, + keys=("image", "image_url"), + media_type="image", + ) + if not url: + raise ValueError(f"{self.log_prefix}: missing image url/data.") + + media_items.append({"url": url, "type": "image"}) + + # 2. Audio Processing + elif has_audio: + if not self.is_support_audio: + raise ValueError( + f"{self.log_prefix}: This mmproj model instance does not support audio inputs." + ) + + if content_type == "input_audio" or "input_audio" in content: + input_audio = content.get("input_audio", {}) + + if isinstance(input_audio, dict) and "data" in input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + + # Strictly align with llama.cpp. + if audio_format not in ["wav", "mp3"]: + raise ValueError( + f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'" + ) + + url = f"data:audio/{audio_format};base64,{audio_data}" + else: + url = input_audio if isinstance(input_audio, str) else "" + else: + url = self._get_media_url( + content, + keys=("audio", "audio_url"), + media_type="audio", + ) + + if not url: + raise ValueError(f"{self.log_prefix}: missing audio url/data.") + + media_items.append({"url": url, "type": "audio"}) + + # 3. Video Processing + elif has_video: + if not self.is_support_video: + raise ValueError( + f"{self.log_prefix}: This libmtmd build does not support video inputs." + ) + + url = self._get_media_url( + content, + keys=("video", "video_url"), + media_type="video", + ) + if not url: + raise ValueError(f"{self.log_prefix}: missing video url/data.") + + media_items.append({"url": url, "type": "video"}) + + # 4. Text & Unknown Types + elif content_type == "text" or "text" in content: + continue + else: + if self.verbose: + print( + f"{self.log_prefix}: ignored unknown content type '{content_type}'.", + file=sys.stderr, + ) + + return media_items + + def _create_bitmap_from_bytes(self, media_bytes: bytes): + """ + Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. + + Supported formats: + - Images (via stb_image): jpg, png, bmp, etc. + - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. + + Note: + - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. + - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. + + Args: + media_bytes (bytes): The raw byte content of the media file. + + Returns: + bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL + """ + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + buf, + len(media_bytes), + False, + ) + + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) + + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." + ) + + return wrapper.bitmap, wrapper.video_ctx + + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) + + def _process_mtmd_prompt( + self, + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + add_generation_prompt: bool = True, + ) -> Tuple[List[int], List[tuple], Any, List[Any]]: + """ + Core multimodal preprocessing pipeline. + Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. + + Features: + - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. + - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. + - Strict RAII-style C++ memory management to prevent leaks on failure. + + Returns: + full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. + chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). + chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). + bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. + """ + # 1. Inject default system prompt if omitted by the user + system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") + if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: + messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages + + media_items = self._get_media_items(messages) + media_marker = self.media_marker + + # 2. Render the chat template and replace actual URLs with C++ media markers + text = self.chat_template.render( + messages=messages, + add_generation_prompt=add_generation_prompt, + eos_token=self.mtmd_eos_token, + bos_token=self.mtmd_bos_token, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + **getattr(self, 'extra_template_arguments', {}) + ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text.replace(tag, media_marker) + + # Replace image_url by media_marker in text + for item in media_items: + text = text.replace(item["url"], media_marker) + + if self.verbose: + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) + + # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding + bitmaps = [None] * len(media_items) + bitmap_cleanup = [] + video_cleanup = [] + chunks = None + + try: + # Concurrent Media Decoding + import concurrent.futures + if media_items: + def _create_bitmap_func(idx: int, item: dict): + media_bytes = self.load_media(item["url"], item["type"]) + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx + # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, + # which can be used in the future to process large numbers of video frames. + max_workers = min(llama.n_threads, len(media_items)) + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] + + for future in concurrent.futures.as_completed(futures): + idx, bitmap, video_ctx = future.result() + + bitmaps[idx] = bitmap + bitmap_cleanup.append(bitmap) + + if video_ctx: + video_cleanup.append(video_ctx) + + # Strict validation: Abort if any thread failed to decode its assigned media + if any(b is None for b in bitmaps): + raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") + else: + if self.verbose: + print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") + else: + # If there are no images, set the bitmaps to empty. + bitmaps = [] + + # 4. Initialize mtmd_input_chunks + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = (llama.n_tokens == 0) + input_text.parse_special = True + + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") + + # 5. Hybrid Tokenization (Text + Media binding) + if len(bitmaps) > 0: + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) + ) + else: + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. + if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + + # 6. Virtual Token Ledger Construction + full_prompt_ids = [] + chunk_token_spans = [] + current_idx = 0 + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + # Cursor to track the actual media contents (URLs or base64 data) provided by the user + media_items_count = len(media_items) + media_items_cur = 0 + last_media_id = None + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: continue + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if self._is_text_chunk(chunk_type): + # Extract standard text token IDs + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) + if tokens_ptr and n_tokens_out.value > 0: + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) + full_prompt_ids.extend(tokens) + current_idx += len(tokens) + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + # Extract media properties + # Note(JamePeng): + # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). + # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. + # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if media_items_cur < media_items_count: + # The C++ parser only sees identical placeholders (e.g., "<__media__>"). + # We MUST inject the actual media content's identity here. + real_media_url = media_items[media_items_cur]["url"] + # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) + # Generate a deterministic, unique negative ID for this specific image/audio. + # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). + # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with + # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). + # This empowers `longest_token_prefix` to correctly identify and reuse cached images, + # while instantly breaking the match if the image content changes. + # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id + media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id + else: + # Magic Negative Number as fallback :) + media_id = -314159 + + if self.verbose: + print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") + + chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) + + # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache + full_prompt_ids.extend([media_id] * chunk_n_tokens) + current_idx += chunk_n_tokens + else: + raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") + + return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup + + except Exception as e: + # Ensure no useless pointers remain upon any failure + # Free chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Free bitmaps + if len(bitmap_cleanup) > 0: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup = None + + bitmaps = None + + raise e + + def __call__( + self, + *, + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + seed: Optional[int] = None, + response_format: Optional[ + llama_types.ChatCompletionRequestResponseFormat + ] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + add_generation_prompt: bool = True, + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, + **kwargs, # type: ignore + ) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], + ]: + # 1. Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None + + # 2. Concurrent Preprocessing & Ledger Construction + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( + llama=llama, + messages=messages, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + add_generation_prompt=add_generation_prompt, + ) + + if self.verbose: + print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) + + try: + # 3. KV Cache Synchronization & State Rollback + # Compares the virtual ledger with physical history to prevent Cache Poisoning. + current_history = llama.input_ids[:llama.n_tokens].tolist() + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) + + if longest_prefix < llama.n_tokens: + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if llama._hybrid_cache_mgr.max_checkpoints > 0: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) + llama._ctx.memory_seq_rm(0, longest_prefix, -1) + llama.n_tokens = longest_prefix + + n_past = llama.n_tokens + + for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: + # Skip previously matched chunks + if end_idx <= n_past: + continue + + if self._is_text_chunk(chunk_type): + unprocessed_start = max(start_idx, n_past) - start_idx + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) + + if tokens_ptr and n_tokens_out.value > 0: + all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + tokens_to_eval = all_tokens[unprocessed_start:] + + if tokens_to_eval: + if self.verbose: + print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + # Text evaluation delegates shift and chunking to native llama.eval + llama.eval(tokens_to_eval) + n_past = llama.n_tokens + + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) + + if self.verbose: + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" + print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + + # Stage 5: Multimodal Physical OOM Defense + if n_past + chunk_n_tokens > llama.n_ctx(): + if not llama._ctx.memory_can_shift(): + raise RuntimeError( + f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " + f"You MUST increase n_ctx to fit the dialogue." + ) + else: + # Safely discard oldest tokens while preserving system prompts + n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch + n_keep = min(llama.n_keep, n_past) + n_discard = min(n_discard, n_past - n_keep) + + if n_discard <= 0: + raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") + + if self.verbose: + print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) + + # Execute physical memory shift + llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) + llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) + + # Shift python virtual array to match + remaining_len = n_past - (n_keep + n_discard) + if remaining_len > 0: + llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] + + n_past -= n_discard + llama.n_tokens = n_past + + # Execute C++ Multimodal Black-box Extraction + new_n_past = llama_cpp_lib.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk_ptr, + llama_cpp_lib.llama_pos(n_past), + llama_cpp_lib.llama_seq_id(0), + llama.n_batch, + True, # logits_last = True, drastically saves computational overhead + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") + + # Update Ledger with "Negative Reverse Vocabulary" IDs + llama.input_ids[n_past : new_n_past.value] = media_id + n_past = new_n_past.value + llama.n_tokens = n_past + + # Extract the final, perfectly synchronized prompt sequence + prompt = llama.input_ids[: llama.n_tokens].tolist() + + # End-of-Turn Checkpoint + # Anchors the state ONLY after the entire multi-modal turn is processed + if ( + llama.is_hybrid + and llama._hybrid_cache_mgr is not None + and llama._hybrid_cache_mgr.max_checkpoints > 0 + ): + if self.verbose: + print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) + + llama._hybrid_cache_mgr.save_checkpoint( + current_pos=llama.n_tokens, + tokens=prompt, + seq_id=0 + ) + finally: + # Cleanup chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Cleanup bitmaps + if bitmap_cleanup: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup.clear() + bitmap_array = None + + # Handle response format and tools (same as before) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if ( + tool_choice is not None + and isinstance(tool_choice, dict) + and tools is not None + ): + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + if llama.verbose: + print(str(e), file=sys.stderr) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, + stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, + ) + + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + def load_media(self, media_url: str, media_type: str) -> bytes: + """ + Unified dispatcher for loading media payloads. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. + """ + if media_type == "image": + return self._load_image(media_url) + + elif media_type == "audio": + audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") + try: + self.detect_audio_format(audio_bytes) + except ValueError as e: + raise ValueError(f"{self.log_prefix}(load_media): {e}") + return audio_bytes + + elif media_type == "video": + return self._load_bytes(media_url, timeout=30, kind="video") + + else: + raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") + + @staticmethod + def detect_audio_format(audio_bytes: bytes) -> str: + """ + Pure utility function: Detects the audio format from magic bytes. + Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility + and avoid false positives (e.g., AVI files disguised as RIFF). + """ + length = len(audio_bytes) + + if length < 12: + raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") + + # RIFF & WAVE magic bytes verification + is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" + + # ID3 metadata or MPEG sync word verification + is_mp3 = length >= 3 and ( + audio_bytes.startswith(b"ID3") or + (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) + ) + + # FLAC magic bytes verification + is_flac = audio_bytes.startswith(b"fLaC") + + if is_wav: + return "wav" + elif is_mp3: + return "mp3" + elif is_flac: + return "flac" + else: + raise ValueError( + "Unsupported audio format detected via magic bytes. " + "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." + ) + + DEFAULT_HTTP_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/148.0.0.0 Safari/537.36" + ), + } + + @staticmethod + def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: + """ + Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. + """ + media_bytes = b"" + + # 1. Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) + + # 2. Handle local file path + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() + + # 3. Handle remote URL via HTTP/HTTPS + else: + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) + try: + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") + + if not media_bytes: + raise ValueError(f"Empty {kind} data received") + + return media_bytes + + @staticmethod + def _load_image(image_url: str) -> bytes: + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + + # 2. Check if image_bytes is empty. + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) + if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): + # Use alpha channel as mask + if image.mode == "P": + image = image.convert("RGBA") + + alpha = image.split()[-1] # Last channel is alpha + # Compute average brightness of visible (non-transparent) pixels + stat = ImageStat.Stat(image.convert("L"), mask=alpha) + + # Choose background: white for dark content, black for bright content + bg_color = (255, 255, 255) # white + if stat.count[0] > 0 and stat.mean[0] > 127: + bg_color = (0, 0, 0) # black + + background = Image.new("RGB", image.size, bg_color) + background.paste(image, mask=alpha) + image = background + + # 5. Ensure RGB mode for formats like CMYK, palette, etc. + elif image.mode != "RGB": + image = image.convert("RGB") + + # 6. Save as high-quality JPEG, suitable for most vision models. + output = io.BytesIO() + image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) + return output.getvalue() + + @classmethod + def from_pretrained( + cls, + repo_id: str, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + **kwargs: Any, + ) -> "MTMDChatHandler": + import fnmatch + from pathlib import Path + + try: + from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore + from huggingface_hub.utils import validate_repo_id # type: ignore + except ImportError: + raise ImportError( + "Llama.from_pretrained requires the huggingface_hub package. " + "You can install it with `pip install --upgrade huggingface_hub`." + ) + + validate_repo_id(repo_id) + + hffs = HfFileSystem() + + files = [ + file["name"] if isinstance(file, dict) else file + for file in hffs.ls(repo_id) # type: ignore + ] + + # split each file into repo_id, subfolder, filename + file_list: List[str] = [] + for file in files: + rel_path = Path(file).relative_to(repo_id) + file_list.append(str(rel_path)) + + matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + + if len(matching_files) == 0: + raise ValueError( + f"No file found in {repo_id} that match {filename}\n\n" + f"Available Files:\n{json.dumps(file_list)}" + ) + + if len(matching_files) > 1: + raise ValueError( + f"Multiple files found in {repo_id} matching {filename}\n\n" + f"Available Files:\n{json.dumps(files)}" + ) + + (matching_file,) = matching_files + + subfolder = str(Path(matching_file).parent) + filename = Path(matching_file).name + + # download the file + hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=cast(Union[str, Path, None], local_dir), + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + ) + + if local_dir is None: + model_path = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + local_files_only=True, + ) + else: + model_path = os.path.join(local_dir, filename) + + return cls( + mmproj_path=model_path, + **kwargs, + ) + +# Generic template-driven MTMD handler. +class GenericMTMDChatHandler(MTMDChatHandler): + """ + Generic MTMD chat handler backed by the model-provided chat template. + + This handler is intentionally template-driven. It renders the model's + tokenizer.chat_template first, then normalizes rendered media URLs or + placeholder tokens into MTMD media markers before tokenization. + + It is designed for model templates that emit media placeholders such as + <|image_pad|>, <|image|>, , [IMG], or Kimi-style <|media_pad|>. + Model-specific handlers may still be preferable when a model requires + special stop tokens, generation flags, or non-standard template arguments. + """ + + KNOWN_MEDIA_TAGS = [ + # Pad placeholders inside model-specific wrappers. + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + + # Direct placeholders inside Gemma/Llama/GLM-style wrappers. + "<|image|>", + "<|audio|>", + "<|video|>", + + # LLaVA / LFM / Mistral-style placeholders. + "", + "