From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 20:58:58 +0200 Subject: [PATCH 01/36] Implemented generic multimodal chat handler. --- llama_cpp/llama.py | 12 +++++++++ llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..848706a90d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -85,6 +85,7 @@ class Llama: def __init__( self, model_path: str, + clip_model_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -608,6 +609,17 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) + + if clip_model_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + gguf_metadata = self.metadata, + clip_model_path = clip_model_path, + model_arch = None, + verbose = self.verbose + ) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..468a73c077 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2887,10 +2887,14 @@ def __init__( raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") # Pre-compile Jinja template + if not hasattr(self, "chat_format") or self.chat_format is None: + self.chat_format = self.CHAT_FORMAT + + self._chat_format_parser_tags = [] self.chat_template = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) + ).from_string(self.chat_format) self._exit_stack = ExitStack() @@ -3116,6 +3120,13 @@ def _process_mtmd_prompt( 
tool_choice=tool_choice, **getattr(self, 'extra_template_arguments', {}) ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + # Replace image_url by media_marker in text for item in media_items: text = text.replace(item["url"], media_marker) @@ -3827,6 +3838,42 @@ def from_pretrained( **kwargs, ) +class GenericMTMDChatHandler(MTMDChatHandler): + def __init__( + self, + gguf_metadata: Dict[str, Any], + clip_model_path: str, + model_arch: Optional[str] = None, + verbose: bool = True, + **kwargs + ) -> None: + self.model_metadata = gguf_metadata + + self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch + + if verbose: + print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) + + if self.arch is None: + if verbose: + print("Unknown model architecture. 
Will use general/most-common tags.") + + self.arch = "unknown" + + if self.chat_format is None: + raise ValueError("Failed to get model chat template automatically.") + + super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + + if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: + self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] + elif self.arch in ["gemma4"]: + self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] + elif self.arch in ["mistral3", "mistral4", "deepseek2"]: + self._chat_format_parser_tags += ["[IMG]"] + elif verbose: + print("Warning: Could not determine chat format parser tags.", flush = True) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 21:19:20 +0200 Subject: [PATCH 02/36] Used text.replace() --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 468a73c077..ab5e438d3e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3125,7 +3125,7 @@ def _process_mtmd_prompt( if tag not in text: continue - text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + text = text.replace(tag, media_marker) # Replace image_url by media_marker in text for item in media_items: From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 17:46:08 +0200 Subject: [PATCH 03/36] Fixed some bugs. 
--- llama_cpp/llama_chat_format.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab5e438d3e..40491968a9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3874,6 +3874,18 @@ def __init__( self._chat_format_parser_tags += ["[IMG]"] elif verbose: print("Warning: Could not determine chat format parser tags.", flush = True) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 18:49:21 +0200 Subject: [PATCH 04/36] Implemented 'chat_handler_kwargs'. --- llama_cpp/llama.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 848706a90d..6dab44602d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -152,6 +152,7 @@ def __init__( spm_infill: bool = False, verbose: bool = True, # Extra Params + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. 
@@ -618,7 +619,8 @@ def __init__( gguf_metadata = self.metadata, clip_model_path = clip_model_path, model_arch = None, - verbose = self.verbose + verbose = self.verbose, + **chat_handler_kwargs ) eos_token_id = self.token_eos() From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sat, 16 May 2026 06:41:17 +0200 Subject: [PATCH 05/36] fix --- llama_cpp/llama.py | 1 - llama_cpp/llama_chat_format.py | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dab44602d..7666b822a8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -618,7 +618,6 @@ def __init__( self.chat_handler = llama_chat_format.GenericMTMDChatHandler( gguf_metadata = self.metadata, clip_model_path = clip_model_path, - model_arch = None, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 40491968a9..0be38a19d3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3839,47 +3839,36 @@ def from_pretrained( ) class GenericMTMDChatHandler(MTMDChatHandler): + KNOWN_MEDIA_TAGS = [ + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + "<|image|>", + "<|audio|>", + "<|video|>", + "[IMG]" + ] + def __init__( self, gguf_metadata: Dict[str, Any], clip_model_path: str, - model_arch: Optional[str] = None, verbose: bool = True, **kwargs ) -> None: self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) - self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) - - if self.arch is None: - if verbose: - print("Unknown model architecture. 
Will use general/most-common tags.") - - self.arch = "unknown" if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) - - if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: - self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] - elif self.arch in ["gemma4"]: - self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] - elif self.arch in ["mistral3", "mistral4", "deepseek2"]: - self._chat_format_parser_tags += ["[IMG]"] - elif verbose: - print("Warning: Could not determine chat format parser tags.", flush = True) def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) + self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] if self.verbose: print(f"{self.log_prefix} - Start processing") From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 06/36] Update Submodule vendor/llama.cpp 39cf5d6..6db1304 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 39cf5d6191..6db130445d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322 From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 19:36:28 +0800 Subject: [PATCH 07/36] build(ci+cu131): bundle LLVM OpenMP runtime for Windows CPU backends - Add a PowerShell step to the Windows CI workflow to locate and copy `libomp140.x86_64.dll` from the Visual Studio redistributables. - Place the runtime DLL into the `llama_cpp\lib` package directory. 
This ensures that the dynamically loaded `ggml-cpu-*.dll` variants (which are built with LLVM OpenMP on Windows) have their required dependencies packaged in the wheel. Without this, `ggml_backend_load_all_from_path()` can silently fail to load the CPU backends at runtime on end-user machines. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 14bea65d19..5f77003a5f 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,6 +67,31 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + - name: Copy LLVM OpenMP runtime + shell: pwsh + run: | + # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. + # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. + # If it is missing from the wheel, ggml_backend_load_all_from_path() + # may fail to load CPU backend DLLs at runtime. + $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" + New-Item -ItemType Directory -Force $packageLibDir | Out-Null + + $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` + -Recurse ` + -Filter "libomp140.x86_64.dll" ` + -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "OpenMP\.LLVM" } | + Select-Object -First 1 + + if (!$omp) { + Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." 
+ exit 1 + } + + Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force + Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" + - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From 677db7b0d5b834ae3d3831af4702ec21986ab335 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 00:12:35 +0200 Subject: [PATCH 08/36] Resolve file conflicts. --- .github/workflows/build-wheels-cu131-win.yml | 25 -------------------- 1 file changed, 25 deletions(-) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 5f77003a5f..14bea65d19 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,31 +67,6 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% - - name: Copy LLVM OpenMP runtime - shell: pwsh - run: | - # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. - # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. - # If it is missing from the wheel, ggml_backend_load_all_from_path() - # may fail to load CPU backend DLLs at runtime. - $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" - New-Item -ItemType Directory -Force $packageLibDir | Out-Null - - $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` - -Recurse ` - -Filter "libomp140.x86_64.dll" ` - -ErrorAction SilentlyContinue | - Where-Object { $_.FullName -match "OpenMP\.LLVM" } | - Select-Object -First 1 - - if (!$omp) { - Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." 
- exit 1 - } - - Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force - Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" - - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From 4794c8c20ee731838cbc2c8d601ccb2c245d6893 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 01:52:48 +0200 Subject: [PATCH 09/36] Added support when using the keyword 'audio' instead of 'audio_url'. --- llama_cpp/llama_chat_format.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f9b9d52367..254195f95a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2996,13 +2996,13 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa media_items.append({"url": url, "type": "image"}) # 2. Audio Processing - elif content_type in ["audio_url", "input_audio"]: + elif content_type in ["audio", "audio_url", "input_audio"]: if not self.is_support_audio: raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url": - audio_url = content["audio_url"] + if content_type == "audio_url" or content_type == "audio": + audio_url = content[content_type] url = audio_url if isinstance(audio_url, str) else audio_url["url"] media_items.append({"url": url, "type": "audio"}) # Case B: Handle OpenAI standard input_audio format From 323da373ad2f30409123bfba8322041113f0eba8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:08:36 +0800 Subject: [PATCH 10/36] build(CMakelists): Improve Windows LLVM OpenMP runtime discovery - Also improve diagnostics by reporting the selected runtime source and path, warning when an explicit override points to a missing file, and keeping a clear runtime warning when no OpenMP DLL can be 
found. Signed-off-by: JamePeng --- CMakeLists.txt | 79 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f09cdb783..1ace43c4aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,8 @@ function(llama_cpp_python_install_target target) endfunction() -# Install an extra Windows runtime DLL into the Python package runtime directory. +# Copy an extra Windows runtime DLL into the Python package runtime directory +# during the CMake install step. # # Some dynamically loaded backend libraries depend on runtime DLLs that are not # always discoverable through $. One important example @@ -75,7 +76,10 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) endif() if(NOT EXISTS "${runtime_file}") - message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + message(WARNING + "Windows runtime DLL was selected but does not exist and will not be copied: " + "${runtime_file}" + ) return() endif() @@ -92,6 +96,11 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) foreach(DIR ${INSTALL_DIRS}) file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + message(STATUS + "Will copy Windows runtime DLL during install: " + "${runtime_file_cmake} -> ${DIR_CMAKE}" + ) + install( FILES "${runtime_file_cmake}" DESTINATION "${DIR_CMAKE}" @@ -115,42 +124,62 @@ function(llama_cpp_python_install_windows_openmp_runtime) endif() set(OPENMP_RUNTIME_DLL "") + set(OPENMP_RUNTIME_SOURCE "") + set(FOUND_OPENMP_DLLS "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL) + if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL") + else() + message(WARNING + "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " + "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " + "LLVM OpenMP runtime discovery." 
+ ) + endif() + endif() - if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - else() + if(NOT OPENMP_RUNTIME_DLL) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_ROOTS - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - ) + set(VS_OPENMP_SEARCH_PATTERNS + # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) - if(EXISTS "${ROOT}") - file( - GLOB_RECURSE FOUND_OPENMP_DLLS - "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" - "${ROOT}/**/libomp140.x86_64.dll" - ) + # Keep these as secondary fallbacks for non-standard installs. 
+ "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "C:/Windows/System32/libomp140.x86_64.dll" + ) - if(FOUND_OPENMP_DLLS) - list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - break() - endif() - endif() + foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") + list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() + + if(FOUND_OPENMP_DLLS) + list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) + list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + endif() endif() if(OPENMP_RUNTIME_DLL) - message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + message(STATUS + "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: " + "${OPENMP_RUNTIME_DLL}" + ) llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") else() message(WARNING - "Could not find libomp140.x86_64.dll. " + "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " + "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " + "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 111819832614d488c840b266ad95f894f420bfea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:48:49 +0800 Subject: [PATCH 11/36] ci(test): add cuda 13.0.2 build workflow Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu130-win.yml | 249 +++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 .github/workflows/build-wheels-cu130-win.yml diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 0000000000..790d7c9665 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,249 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.0.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Inspect Visual Studio OpenMP runtime paths + run: | + Write-Output "ProgramFiles=$env:ProgramFiles" + Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}" + Write-Output "" + + $vsRoots = @( + "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC" + ) + + 
foreach ($root in $vsRoots) { + Write-Output "Checking root: $root" + + if (Test-Path $root) { + Write-Output " Exists: yes" + Write-Output " MSVC version directories:" + + Get-ChildItem $root -Directory -ErrorAction SilentlyContinue | + Sort-Object Name | + ForEach-Object { + Write-Output " $($_.FullName)" + } + + Write-Output " OpenMP runtime candidates:" + + Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue | + Sort-Object FullName | + ForEach-Object { + $sizeKB = [Math]::Round($_.Length / 1KB, 2) + $sizeMB = [Math]::Round($_.Length / 1MB, 4) + + Write-Output " Path: $($_.FullName)" + Write-Output " Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB" + } + } else { + Write-Output " Exists: no" + } + + Write-Output "" + } + + Write-Output "Checking System32 fallback:" + $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll" + + if (Test-Path $system32OpenMP) { + $dll = Get-Item $system32OpenMP + $sizeKB = [Math]::Round($dll.Length / 1KB, 2) + $sizeMB = [Math]::Round($dll.Length / 1MB, 4) + + Write-Output " Path: $($dll.FullName)" + Write-Output " Size: $($dll.Length) bytes / $sizeKB KB / $sizeMB MB" + } else { + Write-Output " Not found: $system32OpenMP" + } + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo 
LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+
+          # Suppress CUDA compiler warnings
+          $cudaDiagSuppress = '--diag-suppress=177,221,550'
+
+          $cmakeArgs = @(
+            # Windows toolchain / common runtime (use the toolchain file discovered and validated above)
+            "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile"
+            '-DLLAMA_BUILD_BORINGSSL=ON'
+
+            # Disable non-wheel targets
+            '-DLLAMA_BUILD_EXAMPLES=OFF'
+            '-DLLAMA_BUILD_TESTS=OFF'
+            '-DLLAMA_BUILD_TOOLS=OFF'
+            '-DLLAMA_BUILD_SERVER=OFF'
+            '-DLLAMA_BUILD_UI=OFF'
+            '-DLLAMA_USE_PREBUILT_UI=OFF'
+            '-DLLAMA_CURL=OFF'
+
+            # GGML dynamic backend layout
+            '-DGGML_CPU=ON'
+            '-DGGML_CUDA=ON'
+            '-DGGML_NATIVE=OFF'
+            '-DGGML_BACKEND_DL=ON'
+            '-DGGML_CPU_ALL_VARIANTS=ON'
+            '-DGGML_OPENMP=ON'
+
+            # CUDA backend
+            "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER"
+            '-DGGML_CUDA_FORCE_MMQ=ON'
+            '-DCUDA_SEPARABLE_COMPILATION=ON'
+            "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress"
+
+            # Build behavior
+            "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS"
+            '-DENABLE_CCACHE=ON'
+          )
+
+          $env:CMAKE_ARGS = $cmakeArgs -join ' '
+          Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS"
+
+          python -m build --wheel
+
+          # Check if wheel was built
+          if (!(Test-Path '.\dist\*.whl')) {
+            Write-Error "No wheel built in dist/ directory"
+            exit 1
+          }
+
+          $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1
+
+          # Wheel filename format:
+          #   name-version-python_tag-abi_tag-platform_tag.whl
+          $parts = $wheelFile.Name.Split('-')
+          $distName = $parts[0]
+          $version = $parts[1]
+          $pyTag = $parts[2]
+          $abiTag = $parts[3]
+          $platTag = $parts[4]
+
+          # CPU all-variants is now an internal runtime layout detail.
+ $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 7a6ee9fcd57438a950eb2ee6c8e079f2409c2765 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 00:33:10 +0800 Subject: [PATCH 12/36] =?UTF-8?q?build(CMakeLists):=20prefer=20VS=202022?= =?UTF-8?q?=20VC143=20OpenMP=20redist=20and=20keep=20System32=20as=20final?= =?UTF-8?q?=20fallback=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JamePeng --- CMakeLists.txt | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ace43c4aa..5b2cfeeb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,7 +135,7 @@ function(llama_cpp_python_install_windows_openmp_runtime) message(WARNING "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " - "LLVM OpenMP runtime discovery." + "VC143 LLVM OpenMP runtime discovery." 
) endif() endif() @@ -144,18 +144,19 @@ function(llama_cpp_python_install_windows_openmp_runtime) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_PATTERNS - # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + set(VS_OPENMP_VC143_PATTERNS + # Prefer VS 2022 VC143 LLVM OpenMP redist paths. + # The MSVC version directory is intentionally globbed because + # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - # Keep these as secondary fallbacks for non-standard installs. + # Secondary VS layout fallbacks for unusual installations. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - "C:/Windows/System32/libomp140.x86_64.dll" ) - foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS}) file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() @@ -164,7 +165,16 @@ function(llama_cpp_python_install_windows_openmp_runtime) list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist") + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + 
set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll") + + if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "System32 fallback") endif() endif() @@ -177,9 +187,10 @@ function(llama_cpp_python_install_windows_openmp_runtime) else() message(WARNING "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " - "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " - "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " - "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 " + "Enterprise/BuildTools VC143 redist paths under Program Files and " + "Program Files (x86), with a fuzzy MSVC version match such as " + "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 50bbdd61fdf7e2e1cd7582a2183e476c98a47c17 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 02:54:00 +0800 Subject: [PATCH 13/36] Update Submodule vendor/llama.cpp f0156d1..7d2b45b Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f0156d1401..7d2b45b4f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f0156d1401500512ad85042ccf38970568b12253 +Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 From 55e855b75f901b494259a1c81b45ac80f0e3013f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:03:15 +0800 Subject: [PATCH 14/36] Update mtmd API 20260609 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 293 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 283 insertions(+), 10 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 4542555c65..61fb0e7859 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -10,12 +10,14 @@ c_uint8, c_int32, c_uint32, + c_int64, c_float, c_void_p, c_size_t, POINTER, _Pointer, # type: ignore Structure, + CFUNCTYPE ) import pathlib from typing import ( @@ -318,6 +320,16 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: """ ... +# // get the current marker string +# MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_get_marker", [mtmd_context_p_ctypes], c_char_p) +def mtmd_get_marker(ctx: mtmd_context_p) -> c_char_p: + """ + get the current marker string + """ + ... + # // mtmd_bitmap # // # // if bitmap is image: @@ -420,6 +432,58 @@ def mtmd_bitmap_set_id( ... 
+# // mtmd_bitmap lazy +# // +# // this is a special bitmap that: +# // - does not hold the actual data +# // - can be expanded into one or more chunks (either media to text chunks) +# // user must provide a callback to fill in the data when mtmd_tokenize() is called +# // this is useful for large video inputs: +# // - allow reading video frame by frame, without loading the entire video into memory +# // - allow tracking the whole video with a single ID (for example, the file hash) + +# // set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically +# // set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically +# // either out_bitmap or out_text can be set, but not both +# // out_bitmap cannot be another lazy bitmap (no nested lazy allowed) +# // return value: +# // 0 on success +# // -1 on EOF (signal to mtmd_tokenize to move on) +# // -2 on error (signal to mtmd_tokenize to abort) +# typedef int(* mtmd_bitmap_lazy_callback)( +# size_t chunk_idx, +# void * user_data, +# mtmd_bitmap ** out_bitmap, +# char ** out_text); +mtmd_bitmap_lazy_callback = CFUNCTYPE( + c_int, + c_size_t, # chunk_idx + c_void_p, # user_data + POINTER(mtmd_bitmap_p), # mtmd_bitmap ** out_bitmap + POINTER(c_char_p), # char ** out_text +) + +# MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, +# const char * id, // usually set to file hash +# void * user_data, +# mtmd_bitmap_lazy_callback callback); +@ctypes_function_mtmd( + "mtmd_bitmap_init_lazy", [ + mtmd_context_p_ctypes, + c_char_p, + c_void_p, + mtmd_bitmap_lazy_callback, + ], mtmd_bitmap_p_ctypes) +def mtmd_bitmap_init_lazy( + ctx: mtmd_context_p, + id: c_char_p, + user_data: c_void_p, + callback: mtmd_bitmap_lazy_callback, # type: ignore + /, +) -> mtmd_bitmap_p: + ... 
+ + # // mtmd_input_chunks # // # // this is simply a list of mtmd_input_chunk @@ -772,6 +836,9 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p: # // BREAKING CHANGES are expected. # // +# struct mtmd_helper_video; +mtmd_helper_video_p = NewType("mtmd_helper_video_p", int) +mtmd_helper_video_p_ctypes = c_void_p # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. @@ -786,11 +853,33 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # ... +# // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). +# MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_helper_support_video", [mtmd_context_p], c_bool) +def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: + """ + Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). + """ + ... + + +# struct mtmd_helper_bitmap_wrapper { +# mtmd_bitmap * bitmap; +# mtmd_helper_video * video_ctx; +# }; +class mtmd_helper_bitmap_wrapper(Structure): + _fields_ = [ + ("bitmap", mtmd_bitmap_p), + ("video_ctx", mtmd_helper_video_p), + ] +mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper) + # // helper function to construct a mtmd_bitmap from a file # // it calls mtmd_helper_bitmap_init_from_buf() internally # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); +# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); @ctypes_function_mtmd( "mtmd_helper_bitmap_init_from_file", [ @@ -798,14 +887,14 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # c_char_p, c_bool, ], - mtmd_bitmap_p_ctypes + mtmd_helper_bitmap_wrapper ) def 
mtmd_helper_bitmap_init_from_file( ctx: mtmd_context_p, fname: c_char_p, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a file it calls mtmd_helper_bitmap_init_from_buf() internally @@ -818,10 +907,13 @@ def mtmd_helper_bitmap_init_from_file( # // supported formats: # // image: formats supported by stb_image: jpg, png, bmp, gif, etc. # // audio: formats supported by miniaudio: wav, mp3, flac -# // note: audio files will be auto-detected based on magic bytes +# // note: +# // - for now, video input is only supported via C++ helper functions +# // - audio files will be auto-detected based on magic bytes +# // - output bitmap will have FNV hash as the ID # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); +# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function_mtmd( "mtmd_helper_bitmap_init_from_buf", [ mtmd_context_p_ctypes, @@ -829,7 +921,7 @@ def mtmd_helper_bitmap_init_from_file( c_size_t, c_bool, ], - mtmd_bitmap_p_ctypes + mtmd_helper_bitmap_wrapper ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, @@ -837,13 +929,16 @@ def mtmd_helper_bitmap_init_from_buf( len: c_size_t, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a buffer containing a file supported formats: - image: formats supported by stb_image: jpg, png, bmp, gif, etc. - audio: formats supported by miniaudio: wav, mp3, flac - note: audio files will be auto-detected based on magic bytes + image: formats supported by stb_image: jpg, png, bmp, gif, etc. 
+ audio: formats supported by miniaudio: wav, mp3, flac + note: + - for now, video input is only supported via C++ helper functions + - audio files will be auto-detected based on magic bytes + - output bitmap will have FNV hash as the ID returns nullptr on failure """ ... @@ -1020,3 +1115,181 @@ def mtmd_helper_decode_image_chunk( ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure """ ... + +# // +# // video input helpers (requires ffmpeg/ffprobe installed on the system) +# // the notion of video only exists at the helper level, it is not visible to the core mtmd library +# // +# // NOTE: this implementation is model-agnostic, it can be used with any vision-capable model +# // however, it may not be accurate for some specific models +# // (this is expected for now, to keep the implementation simple) +# // + +# struct mtmd_helper_video_info { +# uint32_t width; +# uint32_t height; +# float fps; // effective fps (fps_target if set, else original video fps) +# int32_t n_frames; // estimated total frames at effective fps (-1 if unknown) +# }; +class mtmd_helper_video_info(Structure): + _fields_ = [ + ("width", c_uint32), + ("height", c_uint32), + ("fps", c_float), + ("n_frames", c_int32), + ] +mtmd_helper_video_info_p_ctypes = POINTER(mtmd_helper_video_info) + + +# struct mtmd_helper_video_init_params { +# float fps_target; // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f +# const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH +# int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.5s]"); <= 0 means no timestamp, defaulted to 5000ms +# // TODO @ngxson : allow "placeholder" bitmap output for counting tokens +# }; +class mtmd_helper_video_init_params(Structure): + _fields_ = [ + ("fps_target", c_float), + ("ffmpeg_bin_dir", c_char_p), + ("timestamp_interval_ms", c_int64), + ] +mtmd_helper_video_init_params_p_ctypes = 
POINTER(mtmd_helper_video_init_params) + + +# MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void); +@ctypes_function_mtmd( + "mtmd_helper_video_init_params_default", + [], + mtmd_helper_video_init_params, +) +def mtmd_helper_video_init_params_default( + /, +) -> mtmd_helper_video_init_params: + """ + get default init params for mtmd_helper_video + """ + ... + + +# // returns NULL on failure (ffprobe not found, file unreadable, etc.) +# MTMD_API mtmd_helper_video * mtmd_helper_video_init( +# struct mtmd_context * mctx, +# const char * path, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init", [ + mtmd_context_p_ctypes, + c_char_p, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p_ctypes) +def mtmd_helper_video_init( + mctx: mtmd_context_p, + path: c_char_p, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object + returns NULL on failure (ffprobe not found, file unreadable, etc.) + """ + ... + + +# // Same as mtmd_helper_video_init(), but reads from an in-memory buffer. +# // The buffer is copied internally; the caller does not need to keep it alive. +# // Note: pipe input is not seekable, so seeking will use output-side seeking +# // (ffmpeg decodes and discards frames up to the target position). 
+# MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf( +# struct mtmd_context * mctx, +# const unsigned char * buf, size_t len, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init_from_buf", + [ + mtmd_context_p_ctypes, + c_char_p, + c_size_t, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p_ctypes, +) +def mtmd_helper_video_init_from_buf( + mctx: mtmd_context_p, + buf: c_char_p, + len: int, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object from an in-memory video buffer + + The buffer is copied internally, so the caller does not need to keep it alive + after this function returns. + """ + ... + + +# MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_free", [mtmd_helper_video_p_ctypes], None) +def mtmd_helper_video_free( + ctx: mtmd_helper_video_p, + /, +) -> None: + """ + free an mtmd_helper_video object + """ + ... + + +# MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_get_info", [mtmd_helper_video_p_ctypes], mtmd_helper_video_info) +def mtmd_helper_video_get_info( + ctx: mtmd_helper_video_p, + /, +) -> mtmd_helper_video_info: + """ + get video information from an mtmd_helper_video object + """ + ... + + +# // Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call. 
+# // *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free() +# // *out_text - heap-allocated (always via strdup/malloc); caller must free with free() +# // returns 0 on success, -1 on EOF, -2 on error +# MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, +# mtmd_bitmap ** out_bitmap, +# char ** out_text); +@ctypes_function_mtmd( + "mtmd_helper_video_read_next", + [ + mtmd_helper_video_p_ctypes, + POINTER(mtmd_bitmap_p_ctypes), + POINTER(c_char_p), + ], + c_int32, +) +def mtmd_helper_video_read_next( + ctx: mtmd_helper_video_p, + out_bitmap: POINTER(mtmd_bitmap_p_ctypes), # type: ignore + out_text: POINTER(c_char_p), # type: ignore + /, +) -> int: + """ + read the next item from the video stream + + Exactly one of out_bitmap or out_text is set per successful call. + + out_bitmap: + heap-allocated bitmap; caller must free it with mtmd_bitmap_free() + + out_text: + heap-allocated string via strdup/malloc; caller must free it with free() + + returns: + 0 on success + -1 on EOF + -2 on error + """ + ... From 10b4addb9d5f2ff71bddde34f43f8a43fac44b61 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:08:49 +0800 Subject: [PATCH 15/36] feat(mtmd): add video input support to MTMDChatHandler - Add video_url handling to the MTMD chat template and media extraction pipeline. Detect whether the loaded libmtmd build supports video helpers and reject video inputs early when MTMD_VIDEO is unavailable. - Update media loading and bitmap creation for the new helper wrapper API. mtmd_helper_bitmap_init_from_buf now returns a bitmap wrapper containing both the decoded bitmap and an optional video helper context, so keep the video context alive until mtmd_tokenize completes and release it afterward. - Also consolidate duplicated audio/video byte loading into a shared _load_bytes helper, reuse it for image loading, and add richer default HTTP headers for remote media requests. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 173 ++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index fb42a59f23..2224466436 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3064,6 +3064,8 @@ class MTMDChatHandler: "{% else %}" "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" "{% elif content.type == 'text' %}" "{{ content.text }}" "{% endif %}" @@ -3114,6 +3116,10 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + if not os.path.exists(clip_model_path): raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") @@ -3182,6 +3188,15 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if self.verbose: print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) + def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" if getattr(self, "mtmd_ctx", None) is not None: @@ -3259,7 +3274,16 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa if url: media_items.append({"url": url, "type": "audio"}) - # 
3. Text & Unknown Types + # 3. Video Processing + elif content_type == "video_url": + if not self.is_support_video: + raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") + + video_url = content["video_url"] + url = video_url if isinstance(video_url, str) else video_url["url"] + media_items.append({"url": url, "type": "video"}) + + # 4. Text & Unknown Types elif content_type == "text": continue else: @@ -3274,6 +3298,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): Supported formats: - Images (via stb_image): jpg, png, bmp, etc. - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. Note: - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. @@ -3283,25 +3308,35 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): media_bytes (bytes): The raw byte content of the media file. Returns: - mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. 
+ bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL """ if self.mtmd_ctx is None: raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), + buf, len(media_bytes), False, ) - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - return bitmap + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." + ) + + return wrapper.bitmap, wrapper.video_ctx def _process_mtmd_prompt( @@ -3360,16 +3395,17 @@ def _process_mtmd_prompt( # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding bitmaps = [None] * len(media_items) bitmap_cleanup = [] + video_cleanup = [] chunks = None try: # Concurrent Media Decoding import concurrent.futures if media_items: - def _create_bitmap_func(idx: int, item: str): + def _create_bitmap_func(idx: int, item: dict): media_bytes = self.load_media(item["url"], item["type"]) - bitmap = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, # which can be used in the future to process large numbers of video frames. max_workers = min(llama.n_threads, len(media_items)) @@ -3377,10 +3413,14 @@ def _create_bitmap_func(idx: int, item: str): futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] for future in concurrent.futures.as_completed(futures): - idx, bitmap = future.result() + idx, bitmap, video_ctx = future.result() + bitmaps[idx] = bitmap bitmap_cleanup.append(bitmap) + if video_ctx: + video_cleanup.append(video_ctx) + # Strict validation: Abort if any thread failed to decode its assigned media if any(b is None for b in bitmaps): raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") @@ -3415,6 +3455,12 @@ def _create_bitmap_func(idx: int, item: str): if result != 0: raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. + if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + # 6. 
Virtual Token Ledger Construction full_prompt_ids = [] chunk_token_spans = [] @@ -3424,6 +3470,7 @@ def _create_bitmap_func(idx: int, item: str): # Cursor to track the actual media contents (URLs or base64 data) provided by the user media_items_count = len(media_items) media_items_cur = 0 + last_media_id = None for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) @@ -3463,7 +3510,11 @@ def _create_bitmap_func(idx: int, item: str): # while instantly breaking the match if the image content changes. # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id else: # Magic Negative Number as fallback :) media_id = -314159 @@ -3492,6 +3543,12 @@ def _create_bitmap_func(idx: int, item: str): for bitmap in bitmap_cleanup: self._mtmd_cpp.mtmd_bitmap_free(bitmap) bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup = None + bitmaps = None raise e @@ -3825,18 +3882,22 @@ def __call__( def load_media(self, media_url: str, media_type: str) -> bytes: """ Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image or audio processor based on the media_type. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
""" if media_type == "image": return self._load_image(media_url) + elif media_type == "audio": - audio_bytes = self._load_audio(media_url) - # Apply ironclad magic bytes validation before returning + audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") try: self.detect_audio_format(audio_bytes) except ValueError as e: raise ValueError(f"{self.log_prefix}(load_media): {e}") return audio_bytes + + elif media_type == "video": + return self._load_bytes(media_url, timeout=30, kind="video") + else: raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") @@ -3876,41 +3937,51 @@ def detect_audio_format(audio_bytes: bytes) -> str: "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." ) + DEFAULT_HTTP_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/148.0.0.0 Safari/537.36" + ), + } + @staticmethod - def _load_audio(audio_url: str) -> bytes: + def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: """ - Load audio from either a URL, local path, or a data URI and return raw bytes. + Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. """ + media_bytes = b"" - audio_bytes = b"" - - # 1. Handle data URI (base64) - if audio_url.strip().startswith("data:"): - comma_pos = audio_url.find(",") + # 1. Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") if comma_pos == -1: raise ValueError("Invalid data URI: missing comma separator") - base64_data = audio_url[comma_pos + 1 :] - audio_bytes = base64.b64decode(base64_data) + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) # 2. Handle local file path - elif os.path.exists(audio_url): - with open(audio_url, "rb") as f: - audio_bytes = f.read() + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() # 3. 
Handle remote URL via HTTP/HTTPS else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(audio_url, headers=headers) + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) try: - with urllib.request.urlopen(req, timeout=15) as f: - audio_bytes = f.read() + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - if not audio_bytes: - raise ValueError("Empty audio data received") + if not media_bytes: + raise ValueError(f"Empty {kind} data received") - return audio_bytes + return media_bytes @staticmethod def _load_image(image_url: str) -> bytes: @@ -3926,28 +3997,14 @@ def _load_image(image_url: str) -> bytes: Returns: JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. """ - image_bytes = b"" - - # 1. Handle data URI (base64) - if image_url.strip().startswith("data:"): - # Split only once from the right to correctly handle mime types containing commas - comma_pos = image_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = image_url[comma_pos + 1 :] - image_bytes = base64.b64decode(base64_data) - - # 2. Handle local/remote URL - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(image_url, headers=headers) - - try: - with urllib.request.urlopen(req, timeout=15) as f: - image_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download image from {image_url}: {e}") + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + # 2. Check if image_bytes is empty. 
if not image_bytes: raise ValueError("Empty image data received") From e4dcac1af57b58973ecf7e206a3c25b3c367d881 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 22:22:15 +0800 Subject: [PATCH 16/36] Update Submodule vendor/llama.cpp 7d2b45b..d6d0ce8 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 14 ++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 61fb0e7859..30ca8fab90 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -459,8 +459,8 @@ def mtmd_bitmap_set_id( c_int, c_size_t, # chunk_idx c_void_p, # user_data - POINTER(mtmd_bitmap_p), # mtmd_bitmap ** out_bitmap - POINTER(c_char_p), # char ** out_text + POINTER(mtmd_bitmap_p_ctypes), # mtmd_bitmap ** out_bitmap + POINTER(c_char_p), # char ** out_text ) # MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, @@ -856,7 +856,7 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # # // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). # MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); @ctypes_function_mtmd( - "mtmd_helper_support_video", [mtmd_context_p], c_bool) + "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool) def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: """ Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). 
@@ -870,8 +870,8 @@ def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: # }; class mtmd_helper_bitmap_wrapper(Structure): _fields_ = [ - ("bitmap", mtmd_bitmap_p), - ("video_ctx", mtmd_helper_video_p), + ("bitmap", mtmd_bitmap_p_ctypes), + ("video_ctx", mtmd_helper_video_p_ctypes), ] mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper) @@ -1162,9 +1162,7 @@ class mtmd_helper_video_init_params(Structure): [], mtmd_helper_video_init_params, ) -def mtmd_helper_video_init_params_default( - /, -) -> mtmd_helper_video_init_params: +def mtmd_helper_video_init_params_default() -> mtmd_helper_video_init_params: """ get default init params for mtmd_helper_video """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d2b45b4f7..d6d0ce8215 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 +Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f From 54f56bd8f89769f2021f31eba0aa377dc290f203 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 13 Jun 2026 00:46:09 +0800 Subject: [PATCH 17/36] Update Submodule vendor/llama.cpp d6d0ce8..ebc1077 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d6d0ce8215..ebc10770ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f +Subproject commit ebc10770ac5a9331824c53ef0c6adad780904dc3 From 6d1bd3b8d751a3a2ac86d377ecd34a3b37278b15 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 00:04:56 +0800 Subject: [PATCH 18/36] Update Submodule vendor/llama.cpp ebc1077..e8067a8 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ebc10770ac..e8067a8b36 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
ebc10770ac5a9331824c53ef0c6adad780904dc3 +Subproject commit e8067a8b3624aa40cc88ecb2940060e5d65b7532 From 971ee384227f6268f244c93f620b12f0a6ff47c0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 01:03:09 +0800 Subject: [PATCH 19/36] Update(mtmd): Append mtmd batching API - Sync upstream: mtmd: add batching API (#24384) Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 3 + llama_cpp/mtmd_cpp.py | 142 ++++++++++++++++++++++++++++++--- 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2224466436..520d2429d4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3094,6 +3094,7 @@ def __init__( use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + batch_max_tokens: int = 1024, **kwargs ): @@ -3108,6 +3109,7 @@ def __init__( self.clip_model_path = clip_model_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens self.use_gpu = use_gpu self.verbose = verbose @@ -3152,6 +3154,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens # Cache the model's eos token and bos token self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 30ca8fab90..4513761a63 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -153,6 +153,21 @@ class mtmd_pos_type(enum.IntEnum): mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p +# struct mtmd_batch { +# mtmd_context * ctx; +# std::vector entries; +# std::vector output_embd; // aggregated output embedding for the whole batch +# mtmd_batch(mtmd_context * ctx): ctx(ctx) {} +# int32_t n_tokens() const { +# int32_t n = 0; +# for (const auto * chunk : entries) { +# n += mtmd_input_chunk_get_n_tokens(chunk); +# } +# return n; +# } +# }; +mtmd_batch_p = NewType("mtmd_batch_p", int) +mtmd_batch_p_ctypes = c_void_p # struct mtmd_input_text { # const char * text; @@ -210,6 +225,11 @@ class clip_context_params(Structure): # // callback function passed over to mtmd proper # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; +# +# // batching params +# int32_t batch_max_tokens; // maximum number of output tokens in a batch +# // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) +# // (default: 1024) # }; class mtmd_context_params(Structure): _fields_ = [ @@ -224,6 +244,7 @@ class mtmd_context_params(Structure): ("image_max_tokens", c_int), ("cb_eval", ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), + ("batch_max_tokens", c_int32), ] mtmd_context_params_p_ctypes = POINTER(mtmd_context_params) @@ -731,8 +752,8 @@ def mtmd_tokenize( # // returns 0 on success # // TODO: deprecate -# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, -# const mtmd_image_tokens 
* image_tokens); +# DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens), +# "use mtmd_encode_chunk() instead"); @ctypes_function_mtmd( "mtmd_encode", [ mtmd_context_p_ctypes, @@ -745,10 +766,15 @@ def mtmd_encode( image_tokens: mtmd_image_tokens_p, /, ) -> c_int32: + """ + DEPRECATED: use mtmd_encode_chunk() instead + """ ... +# // text chunk will be ignored silently, only media chunk will be encoded # // returns 0 on success +# // returns 1 on generic error # MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, # const mtmd_input_chunk * chunk); @ctypes_function_mtmd( @@ -763,6 +789,11 @@ def mtmd_encode_chunk( chunk: mtmd_input_chunk_p, /, ) -> c_int32: + """ + text chunk will be ignored silently, only media chunk will be encoded + returns 0 on success + returns 1 on generic error + """ ... # // get output embeddings from the last encode pass @@ -778,6 +809,95 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # type: ignor ... +# // batch encoding API +# // chunks are not owned by the batch, they will not be freed by mtmd_batch_free() +# // batch is valid for a given context, cannot be shared across contexts +# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_batch_init", + [mtmd_context_p_ctypes], + mtmd_batch_p_ctypes, +) +def mtmd_batch_init(ctx: mtmd_context_p, /) -> mtmd_batch_p: + ... + + +# MTMD_API void mtmd_batch_free(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_free", + [mtmd_batch_p_ctypes], + None, +) +def mtmd_batch_free(batch: mtmd_batch_p, /): + """ + chunks are not owned by the batch, they will not be freed by mtmd_batch_free() + batch is valid for a given context, cannot be shared across contexts + """ + ... 
+ + +# // only media chunks are allowed, text chunks will be rejected +# // returns 0 on success +# // returns 1 on generic error +# // returns 2 if the batch is too large (chunk won't be added) +# // returns 3 if it cannot be batched with the existing chunks in the batch +# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_add_chunk", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + c_int32, +) +def mtmd_batch_add_chunk( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> c_int32: + """ + only media chunks are allowed, text chunks will be rejected + returns 0 on success + returns 1 on generic error + returns 2 if the batch is too large (chunk won't be added) + returns 3 if it cannot be batched with the existing chunks in the batch + """ + ... + + +# // returns 0 on success +# // returns 1 on generic error +# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_encode", + [mtmd_batch_p_ctypes], + c_int32, +) +def mtmd_batch_encode(batch: mtmd_batch_p, /) -> c_int32: + """ + returns 0 on success + returns 1 on generic error + """ + ... + + +# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_get_output_embd", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + POINTER(c_float), +) +def mtmd_batch_get_output_embd( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> POINTER(c_float): # type: ignore + ... + + # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. 
# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @@ -947,8 +1067,8 @@ def mtmd_helper_bitmap_init_from_buf( # // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: + "mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p) -> c_size_t: """ helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache """ @@ -959,8 +1079,8 @@ def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: # // normally, n_pos is equal to n_tokens, but for M-RoPE it is different # MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_pos", [mtmd_input_chunk_p_ctypes], c_int32) -def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32: + "mtmd_helper_get_n_pos", [mtmd_input_chunks_p_ctypes], c_int32) +def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p) -> c_int32: """ helper to count the total position of tokens from a list of chunks, useful to keep track of n_past normally, n_pos is equal to n_tokens, but for M-RoPE it is different @@ -991,8 +1111,8 @@ def mtmd_helper_image_get_decoder_pos( # // helper function that automatically: # // 1. run llama_decode() on text chunks -# // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -# // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +# // 2. 
run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode() +# // if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error # // otherwise, returns 0 on success # // this function is NOT thread-safe # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, @@ -1007,7 +1127,7 @@ def mtmd_helper_image_get_decoder_pos( "mtmd_helper_eval_chunks", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, - mtmd_input_chunk_p_ctypes, + mtmd_input_chunks_p_ctypes, c_int32, c_int32, c_int32, @@ -1018,7 +1138,7 @@ def mtmd_helper_image_get_decoder_pos( def mtmd_helper_eval_chunks( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, - chunks: mtmd_input_chunk_p, + chunks: mtmd_input_chunks_p, n_past: c_int32, seq_id: c_int32, n_batch: c_int32, @@ -1106,7 +1226,7 @@ def mtmd_helper_decode_image_chunk( n_past: c_int32, seq_id: c_int32, n_batch: c_int32, - new_n_past: c_int32, + new_n_past: POINTER(c_int32), # type: ignore /, ) -> c_int32: """ From cb299e67e51e5aff061ebcf9f1521695ad3f1a5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 03:05:32 +0800 Subject: [PATCH 20/36] Update(MTMDChatHandler): add chunk type helpers - Add small helper methods `_is_text_chunk`/`_is_image_chunk`/`_is_audio_chunk` for checking MTMD text, image, and audio chunk type enum values. - This keeps MTMD prompt processing easier to read and avoids repeating direct enum comparisons when building token spans for text and media chunks. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 520d2429d4..aadec4600e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3341,6 +3341,26 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): return wrapper.bitmap, wrapper.video_ctx + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) def _process_mtmd_prompt( self, @@ -3480,7 +3500,7 @@ def _create_bitmap_func(idx: int, item: dict): if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): # Extract standard text token IDs n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) @@ -3489,10 +3509,7 @@ def _create_bitmap_func(idx: int, item: dict): chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) full_prompt_ids.extend(tokens) current_idx += len(tokens) - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - 
]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): # Extract media properties # Note(JamePeng): # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). @@ -3673,7 +3690,7 @@ def __call__( if end_idx <= n_past: continue - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): unprocessed_start = max(start_idx, n_past) - start_idx n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) @@ -3689,14 +3706,11 @@ def __call__( llama.eval(tokens_to_eval) n_past = llama.n_tokens - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) if self.verbose: - media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) # Stage 5: Multimodal Physical OOM Defense From d8ee3eed7163c6c1f3802a9b979f9009e5e96c53 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sun, 14 Jun 2026 08:00:09 +0200 Subject: [PATCH 21/36] Change 'clip_model_path' to 'mmproj_path'. Implemented 'chat_template_override'. Only the chat template is passed from llama to the chat handler; not the entire model's metadata. 
--- llama_cpp/llama.py | 10 ++++----- llama_cpp/llama_chat_format.py | 39 +++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 544e755ea9..1f5ffa20b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -96,7 +96,7 @@ class Llama: def __init__( self, model_path: str, - clip_model_path: Optional[str] = None, + mmproj_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -710,13 +710,13 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) - if clip_model_path is not None: + if mmproj_path is not None: if self.chat_handler is not None and self.verbose: - print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True) self.chat_handler = llama_chat_format.GenericMTMDChatHandler( - gguf_metadata = self.metadata, - clip_model_path = clip_model_path, + chat_format = self.metadata.get("tokenizer.chat_template", None), + mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 254195f95a..966c2e28fa 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2856,11 +2856,12 @@ class MTMDChatHandler: def __init__( self, - clip_model_path: str, + mmproj_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, **kwargs ): @@ -2872,7 +2873,7 @@ def __init__( f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." 
) - self.clip_model_path = clip_model_path + self.mmproj_path = mmproj_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens self.use_gpu = use_gpu @@ -2883,20 +2884,25 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} - if not os.path.exists(clip_model_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") + if not os.path.exists(mmproj_path): + raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") # Pre-compile Jinja template - if not hasattr(self, "chat_format") or self.chat_format is None: + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override self._chat_format_parser_tags = [] - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True, - ).from_string(self.chat_format) + self.change_chat_template(self.chat_format) self._exit_stack = ExitStack() + + def change_chat_template(self, new_template: str): + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) def _init_mtmd_context(self, llama_model: llama_core.Llama): """Initialize mtmd context with the llama model.""" @@ -2929,13 +2935,13 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), + self.mmproj_path.encode(), llama_model.model, self.mctx_params ) if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") # Check if 
vision is supported self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) @@ -3835,7 +3841,7 @@ def from_pretrained( model_path = os.path.join(local_dir, filename) return cls( - clip_model_path=model_path, + mmproj_path=model_path, **kwargs, ) @@ -3852,13 +3858,12 @@ class GenericMTMDChatHandler(MTMDChatHandler): def __init__( self, - gguf_metadata: Dict[str, Any], - clip_model_path: str, + chat_format: str, + mmproj_path: str, verbose: bool = True, **kwargs ) -> None: - self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.chat_format = chat_format if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) @@ -3866,7 +3871,7 @@ def __init__( if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") - super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) def __call__(self, **kwargs): self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] From 1965d5f6c3c949cab7f7ef934266c8062ebc0f45 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 20:19:43 +0800 Subject: [PATCH 22/36] refactor(mtmd): move multimodal handlers to separate module `llama_multimodal` - Move MTMDChatHandler, GenericMTMDChatHandler, and model-specific multimodal chat handlers out of llama_chat_format.py into llama_multimodal.py. - llama_chat_format.py has grown too large and difficult to maintain, especially as MTMD support expands beyond image-only use cases. Splitting multimodal handling into its own module makes the chat formatting layer smaller and keeps media loading, MTMD tokenization, multimodal KV-cache bookkeeping, and handler implementations in a dedicated place. 
- This also prepares the codebase for broader multimodal support and future video frame / image batch evaluation, where the media-processing path will need to evolve independently from text-only chat formatting. - Keep backward-compatible re-exports from llama_chat_format.py so existing imports continue to work. - Also keep `clip_model_path` as a deprecated initialization alias for `mmproj_path` in the base MTMD handler. Signed-off-by: JamePeng --- llama_cpp/llama.py | 8 +- llama_cpp/llama_chat_format.py | 3811 ++------------------------------ llama_cpp/llama_multimodal.py | 3473 +++++++++++++++++++++++++++++ 3 files changed, 3690 insertions(+), 3602 deletions(-) create mode 100644 llama_cpp/llama_multimodal.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ec202568f1..dbc60eaf76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -45,6 +45,7 @@ from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp.llama_multimodal as llama_multimodal from llama_cpp.llama_speculative import LlamaDraftModel @@ -711,20 +712,19 @@ def __init__( self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) - - if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) if mmproj_path is not None: if self.chat_handler is not None and self.verbose: print("Warning: Both `chat_handler` and `mmproj_path` are not null. 
Chat handler will be overwritten.", flush = True) - self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + self.chat_handler = llama_multimodal.GenericMTMDChatHandler( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) + + if self.verbose: print(f"Model desc: {self.model_desc}, " f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " f"Model metadata: {self.metadata}", diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0e5c9d4906..6ffe68e5e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,7 +1,5 @@ from __future__ import annotations -import base64 -import ctypes import dataclasses import datetime import json @@ -9,9 +7,7 @@ import random import string import sys -import zlib -from contextlib import ExitStack from typing import ( Any, Dict, @@ -32,16 +28,11 @@ import numpy as np import numpy.typing as npt -import urllib.request -from urllib.error import URLError, HTTPError - -import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar -from ._ggml import GGMLLogLevel -from ._logger import logger, ggml_log_callback +from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -3037,3612 +3028,204 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class MTMDChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " -"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, 
professional, and maximally helpful." - ) - - CHAT_FORMAT = ( - "{{ bos_token if bos_token is defined else '' }}" +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% elif message.role == 'user' %}" - "USER: " - "{% if 
message.content is string %}" - "{{ message.content }}" - "{% elif message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% elif content.type == 'audio_url' %}" - "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" - "{% elif content.type == 'input_audio' %}" - "{% if content.input_audio is string %}" - "{{ content.input_audio }}" - "{% else %}" - "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" - "{% endif %}" - "{% elif content.type == 'video_url' %}" - "{{ content.video_url if content.video_url is string else content.video_url.url }}" - "{% elif content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - - "{% elif message.role == 'assistant' and message.content is not none %}" - "ASSISTANT: {{ message.content }}" - "{% endif %}" - "{{ \"\n\" }}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." 
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" "{% endfor %}" - - "{% if eos_token is defined %}" - "{{ eos_token }}" + "<|im_end|>\n" "{% endif %}" - - "{% if add_generation_prompt %}" - "ASSISTANT: " "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) - def __init__( - self, - mmproj_path: str, - verbose: bool = True, - use_gpu: bool = True, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - chat_template_override: Optional[str] = None, - batch_max_tokens: int = 1024, - **kwargs - ): - - self.log_prefix = self.__class__.__name__ - if kwargs: - unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) - raise TypeError( - f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" - f"If you are passing model-specific parameters, ensure they are supported by 
{self.log_prefix}." - ) - - self.mmproj_path = mmproj_path - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - self.batch_max_tokens = batch_max_tokens - self.use_gpu = use_gpu - self.verbose = verbose - - import llama_cpp.mtmd_cpp as mtmd_cpp - self._mtmd_cpp = mtmd_cpp - self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None - self.extra_template_arguments: dict[str, Any] = {} - - self.is_support_vision = False - self.is_support_audio = False - self.is_support_video = False - - if not os.path.exists(mmproj_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") - - # Pre-compile Jinja template - if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: - self.chat_format = self.CHAT_FORMAT - elif chat_template_override is not None: - self.chat_format = chat_template_override - - self._chat_format_parser_tags = [] - self.change_chat_template(self.chat_format) - - self._exit_stack = ExitStack() - - def change_chat_template(self, new_template: str): - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True - ).from_string(new_template) - - def _init_mtmd_context(self, llama_model: llama_core.Llama): - """Initialize mtmd context with the llama model.""" - if self.mtmd_ctx is not None: - return # Already initialized - - self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - 
self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - self.mctx_params.batch_max_tokens = self.batch_max_tokens - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.mmproj_path.encode(), - llama_model.model, - self.mctx_params - ) - - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") - - # Check if vision is supported - self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if video is supported - self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) - if 
self.is_support_video: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) - - def close(self) -> None: - """Explicitly free the mtmd context and vision model resources.""" - if getattr(self, "mtmd_ctx", None) is not None: - try: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - except Exception: - pass - self.mtmd_ctx = None - self.mctx_params = None - self.chat_template = None - - if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): - self._exit_stack.close() - self._exit_stack = None - - def __del__(self) -> None: - self.close() - - def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: - """ - Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. - Strictly enforces capability checks, raising exceptions if unsupported media is passed. - - Returns: - media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). - """ - media_items: List[Dict[str, str]] = [] - for message in messages: - if isinstance(message.get("content"), list): - for content in message["content"]: - content_type = content.get("type", "") - - # 1. Vision Processing - if content_type == "image_url": - if not self.is_support_vision: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") - - url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] - media_items.append({"url": url, "type": "image"}) - - # 2. 
Audio Processing - elif content_type in ["audio", "audio_url", "input_audio"]: - if not self.is_support_audio: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") - - # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url" or content_type == "audio": - audio_url = content[content_type] - url = audio_url if isinstance(audio_url, str) else audio_url["url"] - media_items.append({"url": url, "type": "audio"}) - # Case B: Handle OpenAI standard input_audio format - elif content_type == "input_audio": - input_audio = content.get("input_audio", {}) - if isinstance(input_audio, dict) and "data" in input_audio: - # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic - # input_audio: { - # data: audio.base64Data, - # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' - # } - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - - # Strictly align with llama.cpp (require wav/mp3) - if audio_format not in ["wav", "mp3"]: - raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") - - # Format as a Data URI to reuse the unified load_media logic - media_items.append({ - "url": f"data:audio/{audio_format};base64,{audio_data}", - "type": "audio" - }) - else: - # Just a raw base64 data - url = input_audio if isinstance(input_audio, str) else "" - if url: - media_items.append({"url": url, "type": "audio"}) - - # 3. Video Processing - elif content_type == "video_url": - if not self.is_support_video: - raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") - - video_url = content["video_url"] - url = video_url if isinstance(video_url, str) else video_url["url"] - media_items.append({"url": url, "type": "video"}) - - # 4. 
Text & Unknown Types - elif content_type == "text": - continue - else: - if self.verbose: - print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) - return media_items - - def _create_bitmap_from_bytes(self, media_bytes: bytes): - """ - Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. - - Supported formats: - - Images (via stb_image): jpg, png, bmp, etc. - - Audio (via miniaudio): wav, mp3, flac. - - Video: depends on whether MTMD_VIDEO was enabled at build time. - - Note: - - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. - - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. - - Args: - media_bytes (bytes): The raw byte content of the media file. - - Returns: - bitmap: mtmd_bitmap * - video_ctx: mtmd_helper_video * or NULL - """ - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] - if not media_bytes: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } - buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + ) - wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - buf, - len(media_bytes), - False, + # Case 1: No 
tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + add_generation_prompt=True, ) - if not wrapper.bitmap: - if wrapper.video_ctx: - self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - - raise ValueError( - f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load media from bytes " - "(unsupported media format, corrupted data, or missing helper support)." - ) - - return wrapper.bitmap, wrapper.video_ctx - - def _is_text_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD text chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT - ) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) - def _is_image_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD image chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + 
model=model, + logits_processor=logits_processor, + grammar=grammar, + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, ) - def _is_audio_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None ) - - def _process_mtmd_prompt( - self, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - add_generation_prompt: bool = True, - ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - """ - Core multimodal preprocessing pipeline. - Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. - - Features: - - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. - - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. - - Strict RAII-style C++ memory management to prevent leaks on failure. - - Returns: - full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. - chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). - chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). - bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. - """ - # 1. 
Inject default system prompt if omitted by the user - system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - - media_items = self._get_media_items(messages) - media_marker = self.media_marker - - # 2. Render the chat template and replace actual URLs with C++ media markers - text = self.chat_template.render( + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( messages=messages, - add_generation_prompt=add_generation_prompt, - eos_token=self.mtmd_eos_token, - bos_token=self.mtmd_bos_token, - functions=functions, - function_call=function_call, tools=tools, - tool_choice=tool_choice, - **getattr(self, 'extra_template_arguments', {}) + tool_calls=True, + add_generation_prompt=True, ) - - for tag in self._chat_format_parser_tags: - if tag not in text: - continue - - text = text.replace(tag, media_marker) - - # Replace image_url by media_marker in text - for item in media_items: - text = text.replace(item["url"], media_marker) - - if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) - - # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(media_items) - bitmap_cleanup = [] - video_cleanup = [] - chunks = None - - try: - # Concurrent Media Decoding - import concurrent.futures - if media_items: - def _create_bitmap_func(idx: int, item: dict): - media_bytes = self.load_media(item["url"], item["type"]) - bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap, video_ctx - # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, - # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(media_items)) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] - - for future in concurrent.futures.as_completed(futures): - idx, bitmap, video_ctx = future.result() - - bitmaps[idx] = bitmap - bitmap_cleanup.append(bitmap) - - if video_ctx: - video_cleanup.append(video_ctx) - - # Strict validation: Abort if any thread failed to decode its assigned media - if any(b is None for b in bitmaps): - raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") - else: - if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") - else: - # If there are no images, set the bitmaps to empty. - bitmaps = [] - - # 4. Initialize mtmd_input_chunks - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True - - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") - - # 5. 
Hybrid Tokenization (Text + Media binding) - if len(bitmaps) > 0: - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) - ) - else: - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") - - # Video helper contexts only need to stay alive until mtmd_tokenize() completes. - if video_cleanup: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup.clear() - - # 6. Virtual Token Ledger Construction - full_prompt_ids = [] - chunk_token_spans = [] - current_idx = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - - # Cursor to track the actual media contents (URLs or base64 data) provided by the user - media_items_count = len(media_items) - media_items_cur = 0 - last_media_id = None - - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - - if self._is_text_chunk(chunk_type): - # Extract standard text token IDs - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) - if tokens_ptr and n_tokens_out.value > 0: - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) - full_prompt_ids.extend(tokens) - current_idx += len(tokens) - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - # Extract media properties - # Note(JamePeng): - # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). 
- # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. - # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - - if media_items_cur < media_items_count: - # The C++ parser only sees identical placeholders (e.g., "<__media__>"). - # We MUST inject the actual media content's identity here. - real_media_url = media_items[media_items_cur]["url"] - # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Generate a deterministic, unique negative ID for this specific image/audio. - # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). - # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with - # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). - # This empowers `longest_token_prefix` to correctly identify and reuse cached images, - # while instantly breaking the match if the image content changes. 
- # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 - last_media_id = media_id - media_items_cur += 1 - elif last_media_id is not None: - # video may expand into multiple image chunks from one media marker - media_id = last_media_id - else: - # Magic Negative Number as fallback :) - media_id = -314159 - - if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") - - chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) - - # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache - full_prompt_ids.extend([media_id] * chunk_n_tokens) - current_idx += chunk_n_tokens - else: - raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") - - return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup - - except Exception as e: - # Ensure no useless pointers remain upon any failure - # Free chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Free bitmaps - if len(bitmap_cleanup) > 0: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup = None - # Free videos - if len(video_cleanup) > 0: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup = None - - bitmaps = None - - raise e - - def __call__( - self, - *, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - 
top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - add_generation_prompt: bool = True, - reasoning_budget: int = -1, - reasoning_start: str = "", - reasoning_end: str = "", - reasoning_budget_message: Optional[str] = None, - reasoning_start_in_prompt: bool = False, - reasoning_start_max_tokens: Optional[int] = 32, - **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - # 1. Initialize mtmd context - self._init_mtmd_context(llama) - assert self.mtmd_ctx is not None - - # 2. 
Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( - llama=llama, - messages=messages, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - add_generation_prompt=add_generation_prompt, - ) - - if self.verbose: - print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - - try: - # 3. KV Cache Synchronization & State Rollback - # Compares the virtual ledger with physical history to prevent Cache Poisoning. - current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) - - if longest_prefix < llama.n_tokens: - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if llama._hybrid_cache_mgr.max_checkpoints > 0: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos - if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. 
Clearing cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) - llama._ctx.memory_seq_rm(0, longest_prefix, -1) - llama.n_tokens = longest_prefix - - n_past = llama.n_tokens - - for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: - # Skip previously matched chunks - if end_idx <= n_past: - continue - - if self._is_text_chunk(chunk_type): - unprocessed_start = max(start_idx, n_past) - start_idx - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - - if tokens_ptr and n_tokens_out.value > 0: - all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - tokens_to_eval = all_tokens[unprocessed_start:] - - if tokens_to_eval: - if self.verbose: - print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - # Text evaluation delegates shift and chunking to native llama.eval - llama.eval(tokens_to_eval) - n_past = llama.n_tokens - - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) - - if self.verbose: - media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" - print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - - # Stage 5: Multimodal Physical OOM Defense - if n_past + chunk_n_tokens > llama.n_ctx(): - if not llama._ctx.memory_can_shift(): - raise RuntimeError( - f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " - f"(n_pos_per_embd > 1 or incompatible M-RoPE). 
" - f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " - f"You MUST increase n_ctx to fit the dialogue." - ) - else: - # Safely discard oldest tokens while preserving system prompts - n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch - n_keep = min(llama.n_keep, n_past) - n_discard = min(n_discard, n_past - n_keep) - - if n_discard <= 0: - raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - - if self.verbose: - print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - - # Execute physical memory shift - llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) - llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - - # Shift python virtual array to match - remaining_len = n_past - (n_keep + n_discard) - if remaining_len > 0: - llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - - n_past -= n_discard - llama.n_tokens = n_past - - # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp_lib.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk_ptr, - llama_cpp_lib.llama_pos(n_past), - llama_cpp_lib.llama_seq_id(0), - llama.n_batch, - True, # logits_last = True, drastically saves computational overhead - ctypes.byref(new_n_past) - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") - - # Update Ledger with "Negative Reverse Vocabulary" IDs - llama.input_ids[n_past : new_n_past.value] = media_id - n_past = new_n_past.value - llama.n_tokens = n_past - - # Extract the final, perfectly synchronized prompt sequence - prompt = llama.input_ids[: llama.n_tokens].tolist() - - # End-of-Turn Checkpoint - # Anchors the state ONLY after the 
entire multi-modal turn is processed - if ( - llama.is_hybrid - and llama._hybrid_cache_mgr is not None - and llama._hybrid_cache_mgr.max_checkpoints > 0 - ): - if self.verbose: - print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) - - llama._hybrid_cache_mgr.save_checkpoint( - current_pos=llama.n_tokens, - tokens=prompt, - seq_id=0 - ) - finally: - # Cleanup chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Cleanup bitmaps - if bitmap_cleanup: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup.clear() - bitmap_array = None - - # Handle response format and tools (same as before) - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - tool = None - if ( - tool_choice is not None - and isinstance(tool_choice, dict) - and tools is not None - ): - name = tool_choice["function"]["name"] - tool = next((t for t in tools if t["function"]["name"] == name), None) - if tool is None: - raise ValueError(f"Tool choice '{name}' not found in tools.") - schema = tool["function"]["parameters"] - try: - # create grammar from json schema - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose - ) - except Exception as e: - if llama.verbose: - print(str(e), 
file=sys.stderr) - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - logprobs=top_logprobs if logprobs else None, - stream=stream, - stop=stop, - seed=seed, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logit_bias=logit_bias, - reasoning_budget=reasoning_budget, - reasoning_start=reasoning_start, - reasoning_end=reasoning_end, - reasoning_budget_message=reasoning_budget_message, - reasoning_start_in_prompt=reasoning_start_in_prompt, - reasoning_start_max_tokens=reasoning_start_max_tokens, - ) - - if tool is not None: - tool_name = tool["function"]["name"] - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream - ) - return _convert_completion_to_chat(completion_or_chunks, stream=stream) - - def load_media(self, media_url: str, media_type: str) -> bytes: - """ - Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
- """ - if media_type == "image": - return self._load_image(media_url) - - elif media_type == "audio": - audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") - try: - self.detect_audio_format(audio_bytes) - except ValueError as e: - raise ValueError(f"{self.log_prefix}(load_media): {e}") - return audio_bytes - - elif media_type == "video": - return self._load_bytes(media_url, timeout=30, kind="video") - - else: - raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") - - @staticmethod - def detect_audio_format(audio_bytes: bytes) -> str: - """ - Pure utility function: Detects the audio format from magic bytes. - Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility - and avoid false positives (e.g., AVI files disguised as RIFF). - """ - length = len(audio_bytes) - - if length < 12: - raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") - - # RIFF & WAVE magic bytes verification - is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" - - # ID3 metadata or MPEG sync word verification - is_mp3 = length >= 3 and ( - audio_bytes.startswith(b"ID3") or - (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) - ) - - # FLAC magic bytes verification - is_flac = audio_bytes.startswith(b"fLaC") - - if is_wav: - return "wav" - elif is_mp3: - return "mp3" - elif is_flac: - return "flac" - else: - raise ValueError( - "Unsupported audio format detected via magic bytes. " - "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." - ) - - DEFAULT_HTTP_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/148.0.0.0 Safari/537.36" - ), - } - - @staticmethod - def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: - """ - Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. - """ - media_bytes = b"" - - # 1. 
Handle data URI - if media_url.strip().startswith("data:"): - comma_pos = media_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - - base64_data = media_url[comma_pos + 1:] - media_bytes = base64.b64decode(base64_data) - - # 2. Handle local file path - elif os.path.exists(media_url): - with open(media_url, "rb") as f: - media_bytes = f.read() - - # 3. Handle remote URL via HTTP/HTTPS - else: - req = urllib.request.Request( - media_url, - headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, - ) - try: - with urllib.request.urlopen(req, timeout=timeout) as f: - media_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - - if not media_bytes: - raise ValueError(f"Empty {kind} data received") - - return media_bytes - - @staticmethod - def _load_image(image_url: str) -> bytes: - """ - Load an image from either a URL or a data URI and return it as JPEG bytes. - - Supports: - - Remote images via HTTP/HTTPS (with proper User-Agent) - - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - - Returns: - JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. - """ - # 1. Load image bytes from image_url - image_bytes = MTMDChatHandler._load_bytes( - image_url, - timeout=15, - kind="image", - ) - - # 2. Check if image_bytes is empty. - if not image_bytes: - raise ValueError("Empty image data received") - - # 3. Open image with Pillow - try: - from PIL import Image, ImageStat - except ImportError: - raise ImportError("Pillow is required for image processing. Install with: pip install pillow") - - import io - image = Image.open(io.BytesIO(image_bytes)) - - # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
- if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): - # Use alpha channel as mask - if image.mode == "P": - image = image.convert("RGBA") - - alpha = image.split()[-1] # Last channel is alpha - # Compute average brightness of visible (non-transparent) pixels - stat = ImageStat.Stat(image.convert("L"), mask=alpha) - - # Choose background: white for dark content, black for bright content - bg_color = (255, 255, 255) # white - if stat.count[0] > 0 and stat.mean[0] > 127: - bg_color = (0, 0, 0) # black - - background = Image.new("RGB", image.size, bg_color) - background.paste(image, mask=alpha) - image = background - - # 5. Ensure RGB mode for formats like CMYK, palette, etc. - elif image.mode != "RGB": - image = image.convert("RGB") - - # 6. Save as high-quality JPEG, suitable for most vision models. - output = io.BytesIO() - image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) - return output.getvalue() - - @classmethod - def from_pretrained( - cls, - repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, - **kwargs: Any, - ) -> "MTMDChatHandler": - import fnmatch - from pathlib import Path - - try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore - from huggingface_hub.utils import validate_repo_id # type: ignore - except ImportError: - raise ImportError( - "Llama.from_pretrained requires the huggingface_hub package. " - "You can install it with `pip install --upgrade huggingface_hub`." 
- ) - - validate_repo_id(repo_id) - - hffs = HfFileSystem() - - files = [ - file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id) # type: ignore - ] - - # split each file into repo_id, subfolder, filename - file_list: List[str] = [] - for file in files: - rel_path = Path(file).relative_to(repo_id) - file_list.append(str(rel_path)) - - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore - - if len(matching_files) == 0: - raise ValueError( - f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" - ) - - if len(matching_files) > 1: - raise ValueError( - f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" - ) - - (matching_file,) = matching_files - - subfolder = str(Path(matching_file).parent) - filename = Path(matching_file).name - - # download the file - hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=cast(Union[str, Path, None], local_dir), - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - ) - - if local_dir is None: - model_path = hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=local_dir, - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - local_files_only=True, - ) - else: - model_path = os.path.join(local_dir, filename) - - return cls( - mmproj_path=model_path, - **kwargs, - ) - -class GenericMTMDChatHandler(MTMDChatHandler): - KNOWN_MEDIA_TAGS = [ - "<|image_pad|>", - "<|audio_pad|>", - "<|video_pad|>", - "<|image|>", - "<|audio|>", - "<|video|>", - "[IMG]" - ] - - def __init__( - self, - chat_format: str, - mmproj_path: str, - verbose: bool = True, - **kwargs - ) -> None: - self.chat_format = chat_format - - if verbose: - print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = 
True) - - if self.chat_format is None: - raise ValueError("Failed to get model chat template automatically.") - - super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) - - def __call__(self, **kwargs): - self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Llava15ChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "\nASSISTANT: " - "{% endif %}" - ) - - -class ObsidianChatHandler(MTMDChatHandler): - # Prompt Format - # The model followed ChatML format. However, with ### as the seperator - - # <|im_start|>user - # What is this sign about?\n - # ### - # <|im_start|>assistant - # The sign is about bullying, and it is placed on a black background with a red background. 
- # ### - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}\n" - "###\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "###\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "###\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MoondreamChatHandler(MTMDChatHandler): - # Chat Format: - # f"\n\n{chat_history}Question: {question}\n\nAnswer:" - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "Question: {{ content.text }}\n\n" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "Question: {{ message.content }}\n\n" - "{% endif %}" - "{% endif %}" - # 
Answer: - "{% if message.role == 'assistant' %}" - "Answer:{{ message.content }}\n\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class Llava16ChatHandler(MTMDChatHandler): - # Example prompt - # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "{{ message.content }}" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class NanoLlavaChatHandler(MTMDChatHandler): - # Prompt Format - # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: - - # <|im_start|>system - # Answer the question<|im_end|><|im_start|>user - # - # What is the picture about?<|im_end|><|im_start|>assistant - DEFAULT_SYSTEM_MESSAGE = "Answer the question" - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ 
message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "<|im_end|>" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class Llama3VisionAlphaChatHandler(MTMDChatHandler): - # question = "" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "<|start_header_id|>" - "{% if message.role == 'user' %}" - "user<|end_header_id|>\n\n" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "assistant<|end_header_id|>\n\n" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if 
add_generation_prompt %}" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "{% endif %}" - ) - - -# alias -Llama3VisionAlpha = Llama3VisionAlphaChatHandler - - -class MiniCPMv26ChatHandler(MTMDChatHandler): - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and messages[0]['role'] != 'system' %}" - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is iterable %}" - "{% for content in message['content'] %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - - "{% for content in message['content'] %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% endif %}" - "<|im_end|>\n" - "{% endfor %}" - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MiniCPMv45ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V 4.5 models. - - Supports: - - Multi-step tool calls with and XML tags. - - Integrated reasoning (thinking) process with tags. - - Specialized system prompt handling with tool definitions. - - Global image numbering for multi-image processing. 
- """ - - # Model specific control tokens - MINICPMV_BOS_TOKEN = "<|im_start|>" - MINICPMV_EOS_TOKEN = "<|im_end|>" - MINICPMV_PAD_TOKEN = "<|endoftext|>" - - # Image placeholder tags - MINICPMV_IMAGE_START_TOKEN = "" - MINICPMV_IMAGE_END_TOKEN = "" - MINICPMV_IMAGE_ID_START_TOKEN = "" - MINICPMV_IMAGE_ID_END_TOKEN = "" - - CHAT_FORMAT = ( - # --- 1. First System Message & Tools Definitions --- - "{%- if tools %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" - "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" - "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" - "{{- 'You are provided with function signatures within XML tags:\\n' }}" - "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" - "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- elif messages[0].role == 'system' %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - - # --- 2. 
Message Stream Processing --- - "{% set image_count = namespace(value=0) %}" - "{%- for message in messages %}" - # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- - "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" - "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" - - "{%- set content = message.content %}" - "{%- if content is not string %}" - "{%- set ns = namespace(content_str='') %}" - "{%- for item in content %}" - # --- Explicit image_url type and value checking --- - "{%- if item.type == 'image_url' %}" - "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" - "{%- set image_count.value = image_count.value + 1 %}" - # Format: N: IMAGE_URL - "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" - "{%- elif item.type == 'text' %}" - "{%- set ns.content_str = ns.content_str + item.text %}" - "{%- endif %}" - "{%- endfor %}" - "{%- set content = ns.content_str %}" - "{%- endif %}" - - "{{- content -}}" - - # Append tool_calls to assistant messages if they exist - "{%- if message.role == 'assistant' and message.tool_calls %}" - "{%- for tool_call in message.tool_calls %}" - "{%- set tc = tool_call.function if tool_call.function else tool_call %}" - "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" - "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" - "{{- '}\\n' }}" - "{%- endfor %}" - "{%- endif %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - - # --- Specialized Tool Response Handling --- - # Group consecutive tool responses under a single user-like block - "{%- elif message.role == 'tool' %}" - "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" - "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" - "{%- endif %}" - "{{- '\\n\\n' + message.content + '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" - "{{- '" + 
MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - - # --- 3. Generation Prompt --- - "{%- if add_generation_prompt %}" - "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" - # Handle thinking/reasoning block visibility based on configuration - "{%- if enable_thinking is defined and enable_thinking is false %}" - "{{- '\\n\\n\\n\\n' }}" - "{%- elif enable_thinking is defined and enable_thinking is true %}" - "{{- '\\n' }}" - "{%- endif %}" - "{%- endif %}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V 4.5 Handler. - - Args: - enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base MTMDChatHandler. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject thinking control flag into the template - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set stop token patch - kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class MiniCPMV46ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V-4.6 models. - - Features: - - Aligned with official tokenizer_config.json special tokens. - - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. - - Integrated MTMD-style URL and Base64 injection for visual content. - - Specialized `` and `` block generation. - - Autonomously folds previous reasoning paths using `last_query_index`. - - Toggles `` block generation via `enable_thinking` (Defaults to False). 
- """ - - # Core tokens - MINICPM_BOS_TOKEN = "<|im_start|>" - MINICPM_EOS_TOKEN = "<|im_end|>" - MINICPM_PAD_TOKEN = "<|endoftext|>" - - # Vision tokens - MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" - MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" - MINICPM_IMAGE_TOKEN = "<|image_pad|>" - MINICPM_VIDEO_TOKEN = "<|video_pad|>" - - CHAT_FORMAT = ( - "{%- if enable_thinking is not defined -%}\n" - " {%- set enable_thinking = false -%}\n" - "{%- endif -%}\n" - "{%- macro render_content(content, is_system_content=false) -%}\n" - " {%- if content is string -%}\n" - " {{- content -}}\n" - " {%- elif content is iterable and content is not mapping -%}\n" - " {%- set ns = namespace(parts=[]) -%}\n" - " {%- for item in content -%}\n" - " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" - " {%- if is_system_content -%}\n" - " {{- raise_exception('System message cannot contain images.') -}}\n" - " {%- endif -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.type == 'image_url' -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" - # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" - # " {%- if is_system_content -%}\n" - # " {{- raise_exception('System message cannot contain videos.') -}}\n" - # " {%- endif -%}\n" - # " {%- set url_val = '' -%}\n" - # " {%- if item.type == 'video_url' -%}\n" - # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" - # " {%- endif -%}\n" - # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" - " {%- elif 'text' in item -%}\n" - " {%- set ns.parts = ns.parts + [item.text] -%}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected item type in content.') -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.parts | join('\\n') -}}\n" - " {%- elif content is none or content is 
undefined -%}\n" - " {{- '' -}}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected content type.') -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- if not messages %}\n" - " {{- raise_exception('No messages provided.') }}\n" - "{%- endif %}\n" - "{%- if tools and tools is iterable and tools is not mapping %}\n" - " {{- '<|im_start|>system\\n' }}\n" - " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" - " {%- for tool in tools %}\n" - " {{- '\\n' }}\n" - " {{- tool | tojson }}\n" - " {%- endfor %}\n" - " {{- '\\n' }}\n" - " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {%- if content %}\n" - " {{- '\\n\\n' + content }}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - "{%- else %}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" - "{%- for message in messages[::-1] %}\n" - " {%- set index = (messages|length - 1) - loop.index0 %}\n" - " {%- if ns.multi_step_tool and message.role == 'user' %}\n" - " {%- set content = 
render_content(message.content)|trim %}\n" - " {%- if not(content.startswith('') and content.endswith('')) %}\n" - " {%- set ns.multi_step_tool = false %}\n" - " {%- set ns.last_query_index = index %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if ns.multi_step_tool %}\n" - " {{- raise_exception('No user query found in messages.') }}\n" - "{%- endif %}\n" - "{%- for message in messages %}\n" - " {%- set content = render_content(message.content)|trim %}\n" - " {%- if message.role == 'system' %}\n" - " {%- if not loop.first %}\n" - " {{- raise_exception('System message must be at the beginning.') }}\n" - " {%- endif %}\n" - " {%- elif message.role == 'user' %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" - " {%- elif message.role == 'assistant' %}\n" - " {%- set reasoning_content = '' %}\n" - " {%- if message.reasoning_content is string %}\n" - " {%- set reasoning_content = message.reasoning_content %}\n" - " {%- else %}\n" - " {%- if '' in content %}\n" - " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {%- set reasoning_content = reasoning_content|trim %}\n" - " {%- if loop.index0 > ns.last_query_index %}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" - " {%- else %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" - " {%- endif %}\n" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" - " {%- for tool_call in message.tool_calls %}\n" - " {%- if tool_call.function is defined %}\n" - " {%- set tool_call = tool_call.function %}\n" - " {%- endif %}\n" - " {%- if loop.first %}\n" - " {%- if content|trim %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " 
{{- '\\n\\n\\n' }}\n" - " {%- endif %}\n" - " {%- if tool_call.arguments is defined %}\n" - " {%- for args_name, args_value in tool_call.arguments|items %}\n" - " {{- '\\n' }}\n" - " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" - " {{- args_value }}\n" - " {{- '\\n\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif message.role == 'tool' %}\n" - " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" - " {{- '<|im_start|>user' }}\n" - " {%- endif %}\n" - " {{- '\\n\\n' }}\n" - " {{- content }}\n" - " {{- '\\n' }}\n" - " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif loop.last %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " {{- raise_exception('Unexpected message role.') }}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if add_generation_prompt %}\n" - " {{- '<|im_start|>assistant\\n' }}\n" - " {%- if enable_thinking is defined and enable_thinking is false %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V-4.6 Handler. - - Args: - enable_thinking (bool): Controls whether to open a `` block for reasoning. - Defaults to False as per the standard template logic. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # MiniCPM uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class Gemma3ChatHandler(MTMDChatHandler): - - GEMMA3_BOI_TOKEN = "" - GEMMA3_EOI_TOKEN = "" - GEMMA3_BOS_TOKEN = "" - GEMMA3_EOS_TOKEN = "" - - CHAT_FORMAT = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" - "{% if messages[0]['content'] is string %}" - "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" - "{% else %}" - "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" - "{% endif %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set first_user_prefix = '' %}" - "{% endif %}" - - "{% for message in loop_messages %}" - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" - "{% endif %}" - - "{% if message['role'] == 'assistant' %}" - "{% set role = 'model' %}" - "{% else %}" - "{% set role = message['role'] %}" - "{% endif %}" - - "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" - - "{% if message['content'] is string %}" - "{{ message['content'] | trim }}" - "{% elif message['content'] is iterable %}" - "{% for item in message['content'] %}" - "{% if item['type'] == 'image_url' and item['image_url'] is string %}" - "{{ '' + item['image_url'] + '' }}" - "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" - "{{ '' + item['image_url']['url'] + '' }}" - "{% elif item['type'] == 'text' %}" - "{{ item['text'] | 
trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "model\n" - "{% endif %}" - ) - - -class Gemma4ChatHandler(MTMDChatHandler): - """ - Handler for Gemma 4 models. - - Note on `enable_thinking`: - The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. - It is NOT supported by Gemma4 E2B and E4B models. - - [Important Note for Audio Processing!] - It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. - Other quantizations are known to have degraded performance; - ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 - """ - - # The special token in Gemma 4 - GEMMA4_BOI_TOKEN = "<|image>" - GEMMA4_EOI_TOKEN = "" - GEMMA4_BOA_TOKEN = "<|audio>" - GEMMA4_EOA_TOKEN = "" - GEMMA4_BOS_TOKEN = "" - GEMMA4_EOS_TOKEN = "" - GEMMA4_SOT_TOKEN = "<|turn>" - GEMMA4_EOT_TOKEN = "" - GEMMA4_SOC_TOKEN = "<|channel>" - GEMMA4_EOC_TOKEN = "" - GEMMA4_STC_TOKEN = "<|tool_call>" - GEMMA4_ETC_TOKEN = "" - GEMMA4_STD_TOKEN = "<|tool>" - GEMMA4_ETD_TOKEN = "" - GEMMA4_STR_TOKEN = "<|tool_response>" - GEMMA4_ETR_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" - " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in properties | dictsort -%}\n" - " {%- set add_comma = false -%}\n" - " {%- if not filter_keys or key not in standard_keys -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {{ key }}:{\n" - " {%- if value['description'] -%}\n" - " description:<|\"|>{{ value['description'] }}<|\"|>\n" - " {%- set add_comma = true -%}\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'STRING' -%}\n" - " {%- if value['enum'] -%}\n" - " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " enum:{{ format_argument(value['enum']) }}\n" - " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'ARRAY' -%}\n" - " {%- if value['items'] is mapping and value['items'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " items:{\n" - " {%- set ns_items = namespace(found_first=false) -%}\n" - " {%- for item_key, item_value in value['items'] | dictsort -%}\n" - " {%- if item_value is not none -%}\n" - " {%- if ns_items.found_first %},{% endif -%}\n" - " {%- set ns_items.found_first = true -%}\n" - " {%- if item_key == 'properties' -%}\n" - " properties:{\n" - " {%- if item_value is mapping -%}\n" - " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- elif item_key == 'required' -%}\n" - " required:[\n" - " {%- for req_item in item_value -%}\n" - " <|\"|>{{- req_item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- elif item_key == 'type' -%}\n" - " {%- if item_value is string -%}\n" - " type:{{ format_argument(item_value | upper) }}\n" - " {%- else -%}\n" - " type:{{ format_argument(item_value | map('upper') | list) }}\n" - " {%- endif -%}\n" - " {%- else -%}\n" - " {{ item_key }}:{{ format_argument(item_value) }}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " }\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'OBJECT' -%}\n" - " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " }\n" - " {%- elif value is mapping 
-%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" - " }\n" - " {%- endif -%}\n" - " {%- if value['required'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - "{%- endmacro -%}\n" - "{%- macro format_function_declaration(tool_data) -%}\n" - " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" - " {%- set params = tool_data['function']['parameters'] -%}\n" - " {%- if params -%}\n" - " ,parameters:{\n" - " {%- if params.get('properties') -%}\n" - " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" - " {%- endif -%}\n" - " {%- if params.get('required') -%}\n" - " required:[\n" - " {%- for item in params['required'] -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {{- ',' if not loop.last -}}\n" - " {%- endfor -%}\n" - " ],\n" - " {%- endif -%}\n" - " {%- if params.get('type') -%}\n" - " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if 'response' in tool_data['function'] -%}\n" - " {%- set response_declaration = tool_data['function']['response'] -%}\n" - " ,response:{\n" - " {%- if response_declaration['description'] -%}\n" - " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" - " {%- endif -%}\n" - " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" - " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " }\n" - "{%- endmacro -%}\n" - "{%- macro format_argument(argument, escape_keys=True) -%}\n" - " {%- if argument is string -%}\n" - " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" - " {%- elif argument is boolean -%}\n" - " {{- 'true' if argument else 'false' -}}\n" - " {%- elif argument is mapping -%}\n" - " {{- '{' -}}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in argument | dictsort -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {%- if escape_keys -%}\n" - " {{- '<|\"|>' + key + '<|\"|>' -}}\n" - " {%- else -%}\n" - " {{- key -}}\n" - " {%- endif -%}\n" - " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- elif argument is sequence -%}\n" - " {{- '[' -}}\n" - " {%- for item in argument -%}\n" - " {{- format_argument(item, escape_keys=escape_keys) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- ']' -}}\n" - " {%- else -%}\n" - " {{- argument -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- macro strip_thinking(text) -%}\n" - " {%- set ns = namespace(result='') -%}\n" - " {%- for part in text.split('') -%}\n" - " {%- if '<|channel>' in part -%}\n" - " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" - " {%- else -%}\n" - " {%- set ns.result = ns.result + part -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.result | trim -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- macro format_tool_response_block(tool_name, response) -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if response is mapping -%}\n" - " {{- 'response:' + tool_name + '{' -}}\n" - " {%- for key, value in response | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- else -%}\n" 
- " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" - " {%- endif -%}\n" - " {{- '' -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- set ns = namespace(prev_message_type=None) -%}\n" - "{%- set loop_messages = messages -%}\n" - "{{- bos_token -}}\n" - "{#- Handle System/Tool Definitions Block -#}\n" - "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- '<|turn>system\\n' -}}\n" - " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" - " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>\\n' -}}\n" - " {%- set ns.prev_message_type = 'think' -%}\n" - " {%- endif -%}\n" - " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {%- if messages[0]['content'] is string -%}\n" - " {{- messages[0]['content'] | trim -}}\n" - " {%- elif messages[0]['content'] is sequence -%}\n" - " {%- for item in messages[0]['content'] -%}\n" - " {{- item['text'] | trim + ' '-}}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set loop_messages = messages[1:] -%}\n" - " {%- endif -%}\n" - " {%- if tools -%}\n" - " {%- for tool in tools %}\n" - " {{- '<|tool>' -}}\n" - " {{- format_function_declaration(tool) | trim -}}\n" - " {{- '' -}}\n" - " {%- endfor %}\n" - " {%- set ns.prev_message_type = 'tool' -%}\n" - " {%- endif -%}\n" - " {{- '\\n' -}}\n" - "{%- endif %}\n" - "\n" - "{#- Pre-scan: find last user message index for reasoning guard -#}\n" - "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" - "{%- for i in range(loop_messages | length) -%}\n" - " {%- if loop_messages[i]['role'] == 'user' -%}\n" - " {%- set ns_turn.last_user_idx = i -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{#- Loop through messages -#}\n" - "{%- for message in loop_messages -%}\n" - " {%- if message['role'] != 'tool' -%}\n" - " {%- set ns.prev_message_type = None -%}\n" - " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" - " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" - " {%- set prev_nt = namespace(role=None, found=false) -%}\n" - " {%- if loop.index0 > 0 -%}\n" - " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" - " {%- if not prev_nt.found -%}\n" - " {%- if loop_messages[j]['role'] != 'tool' -%}\n" - " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" - " {%- set prev_nt.found = true -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" - " {%- if not continue_same_model_turn -%}\n" - " {{- '<|turn>' + role + '\\n' }}\n" - " {%- endif -%}\n" - "\n" - " {#- Render reasoning/reasoning_content as thinking channel -#}\n" - " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" - " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" - " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" - " {%- endif -%}\n" - "\n" - " {%- if message.get('tool_calls') -%}\n" - " {%- for tool_call in message['tool_calls'] -%}\n" - " {%- set function = tool_call['function'] -%}\n" - " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" - " {%- if function['arguments'] is mapping -%}\n" - " {%- set ns_args = namespace(found_first=false) -%}\n" - " {%- for key, value in function['arguments'] | dictsort -%}\n" - " {%- if ns_args.found_first %},{% endif -%}\n" - " {%- set ns_args.found_first = true -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- endfor -%}\n" - " {%- elif function['arguments'] is string -%}\n" - " {{- function['arguments'] -}}\n" - " {%- endif -%}\n" - " {{- '}' -}}\n" - " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_call' -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" - " {%- if message.get('tool_responses') -%}\n" - " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" - " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endfor -%}\n" - " {%- elif message.get('tool_calls') -%}\n" - " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" - " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" - " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" - " {%- if ns_tool_scan.stopped -%}\n" - " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" - " {%- set ns_tool_scan.stopped = true -%}\n" - " {%- else -%}\n" - " {%- set follow = loop_messages[k] -%}\n" - " {#- Resolve tool_call_id to function name -#}\n" - " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" - " {%- for tc in message['tool_calls'] -%}\n" - " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" - " {%- set ns_tname.name = tc['function']['name'] -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {#- Handle content as string or content-parts array -#}\n" - " {%- set tool_body = follow.get('content') -%}\n" - " {%- if tool_body is string -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- elif tool_body is sequence and tool_body is not string -%}\n" - " {%- set ns_txt = namespace(s='') -%}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'text' -%}\n" - " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'image_url' -%}\n" - " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" - " {%- if part.get('type') == 'audio_url' -%}\n" - " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif part.get('type') == 'input_audio' -%}\n" - " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - # " {%- elif part.get('type') == 'video_url' -%}\n" - # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- else -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set captured_content -%}\n" - " {%- if message['content'] is string -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(message['content']) -}}\n" - " {%- else -%}\n" - " {{- message['content'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif message['content'] is sequence -%}\n" - " {%- for item in message['content'] -%}\n" - " {%- if item['type'] == 'text' -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(item['text']) -}}\n" - " {%- else -%}\n" - " {{- item['text'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif item['type'] == 'image_url' -%}\n" - " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- set ns.prev_message_type = 'image' -%}\n" - 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" - " {%- if item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- endif -%}\n" - # " {%- elif item['type'] == 'video_url' -%}\n" - # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endset -%}\n" - "\n" - " {{- captured_content -}}\n" - " {%- set has_content = captured_content | trim | length > 0 -%}\n" - "\n" - " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" - " {{- '<|turn>model\\n' -}}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- '<|channel>thought\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Gemma 4 Handler. - - Args: - enable_thinking (bool): Controls whether the <|think|> tag is injected and - manages <|channel>thought behavior. - Note: ONLY supported on Gemma4 31B and 26BA4B models. - NOT supported on Gemma4 E2B and E4B models. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [1, 106, 50] - kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GLM41VChatHandler(MTMDChatHandler): - # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. - - GLM41V_EOS_TOKEN = "<|endoftext|>" - GLM41V_PAD_TOKEN = "<|endoftext|>" - GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]\n" - "{%- for msg in messages -%}" - "{%- if msg.role == 'system' -%}" - "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'user' -%}" - "<|user|>\n" - "{%- if msg.content is string -%}" - "{{ msg.content }}" - "{%- else -%}" - "{%- for item in msg.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'assistant' -%}" - "{%- if msg.metadata -%}" - "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- else -%}" - "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{%- endif -%}" - ) - - def __call__(self, **kwargs): - 
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json - stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch - kwargs['stop'] = stop_tokens - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class GLM46VChatHandler(MTMDChatHandler): - GLM46V_EOS_TOKEN = "<|endoftext|>" - GLM46V_PAD_TOKEN = "<|endoftext|>" - GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]" - "{%- if tools -%}" - "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" - "You are provided with function signatures within XML tags:\n\n" - "{%- for tool in tools -%}" - "{{ tool | tojson(ensure_ascii=False) }}\n" - "{%- endfor -%}" - "\n\nFor each function call, output the function name and arguments within the following XML format:\n" - "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" - "{%- endif -%}" - - "{%- for m in messages -%}" - "{%- if m.role == 'system' -%}" - "<|system|>\n{{ m.content }}" - "{%- elif m.role == 'user' -%}" - "<|user|>\n" - "{%- if m.content is string -%}" - "{{ m.content }}" - "{%- else -%}" - "{%- for item in m.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
- "{{ '/nothink' if not enable_thinking else '' }}" - "{%- elif m.role == 'assistant' -%}" - "<|assistant|>" - "{%- if enable_thinking -%}" - "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" - "\n{{ reasoning.strip() }}" - "{%- else -%}" - "\n" - "{%- endif -%}" - "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" - "{%- endif -%}" - "{{ GLM46V_EOS_TOKEN }}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{{ '' if enable_thinking else '\n' }}" - "{%- endif -%}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - GLM-4.6V Handler - Parameters: - - enable_thinking (bool): Whether to enable the model's think process. The default is True. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN - - # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json - kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GraniteDoclingChatHandler(MTMDChatHandler): - """ - Handler for Granite-Docling models. - - Format(512x512): Content - - Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! - Since the model does not have special tokens for the start and end of an image, - it is recommended to process only one image at a time. - You can iterate through the images individually for recognition. 
- - """ - GRANITE_BOS_TOKEN = "<|start_of_role|>" - GRANITE_EOS_TOKEN = "<|end_of_text|>" - GRANITE_PAD_TOKEN = "<|end_of_text|>" - GRANITE_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for part in message['content'] -%}" - "{%- if part['type'] == 'text' -%}" - "{{- part['text'] -}}" - "{%- elif part['type'] == 'image_url' -%}" - "{%- if part.image_url is string -%}" - "{{- part.image_url -}}" - "{%- else -%}" - "{{- part.image_url.url -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '<|end_of_text|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|start_of_role|>assistant' -}}" - # Support the 'controls' parameter if present in generation arguments - "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" - "{{- '<|end_of_role|>' -}}" - "{%- endif -%}" - ) - - def __init__(self, controls: dict = None, **kwargs): - """ - Granite-Docling Handler - Args: - controls (dict, optional): Operational parameters passed to the assistant role. - - The 'controls' parameter is used to guide the model's behavior or output format. 
- Common examples for 'controls' include: - - Document Parsing: {"mode": "document_parsing", "format": "json"} - """ - self.controls = controls - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject controls into the template environment - self.extra_template_arguments["controls"] = self.controls - self.DEFAULT_SYSTEM_MESSAGE = None - kwargs['stop'] = [self.GRANITE_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - - return super().__call__(**kwargs) - - -class LFM2VLChatHandler(MTMDChatHandler): - LFM2VL_BOS_TOKEN = "<|startoftext|>" - LFM2VL_EOS_TOKEN = "<|im_end|>" - LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{ '<|im_start|>' + message['role'] + '\n' }}" - "{%- if message['content'] is string -%}" - "{{ message['content'] }}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if 'image_url' in content -%}" - "{%- if content.image_url is string -%}" - "<|image_start|>{{ content.image_url }}<|image_end|>" - "{%- else -%}" - "<|image_start|>{{ content.image_url.url }}<|image_end|>" - "{%- endif -%}" - "{%- elif content['type'] == 'text' -%}" - "{{ content['text'] }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{ '<|im_end|>\n' }}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{ '<|im_start|>assistant\n' }}" - "{%- endif -%}" - ) - - def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): - """ - LFM2-VL Handler - LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 - """ - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) - - 
def __call__(self, **kwargs): - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class LFM25VLChatHandler(MTMDChatHandler): - """ - Handler for LFM2.5-VL multimodal models. - - Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. - """ - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '' in content -%}\n" - " {%- set content = content.split('')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") - self.image_min_tokens = -1 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class PaddleOCRChatHandler(MTMDChatHandler): - """ - Handler for PaddleOCR 1.5/1.6 multimodal models. - """ - - PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" - PADDLEOCR_BOS_TOKEN = "" - PADDLEOCR_EOS_TOKEN = "" - PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" - PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" - PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" - - CHAT_FORMAT = ( - "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" - "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" - "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" - - "{{- cls_token -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'user' -%}" - "{{- 'User: ' -}}" - - # Robust parsing: Check if content is string or list - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - # Pass 1: Render all images first - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" - "{{- '<|IMAGE_START|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|IMAGE_END|>' -}}" - "{%- endif -%}" - "{%- endfor -%}" - - # Pass 2: Render all text second - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '\\n' -}}" - - "{%- elif message['role'] == 'assistant' -%}" - "{{- 
'Assistant:\\n' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- eos_token -}}" - - "{%- elif message['role'] == 'system' -%}" - "{%- if message['content'] is string -%}" - "{{- message['content'] + '\\n' -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] + '\\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "{{- 'Assistant:\\n' -}}" - "{%- endif -%}" - ) - - def __init__( - self, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__( - image_min_tokens=self.image_min_tokens, - image_max_tokens=self.image_max_tokens, - **kwargs - ) - - def __call__(self, **kwargs): - # Set the specific stop token defined in the PaddleOCR template - kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class Qwen25VLChatHandler(MTMDChatHandler): - - QWEN25_VL_BOS_TOKEN = "<|endoftext|>" - QWEN25_VL_PAD_TOKEN = "<|endoftext|>" - QWEN25_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in 
message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" - "<|im_start|>assistant\n" - ) - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen3ASRChatHandler(MTMDChatHandler): - """ - Handler for Qwen 3 ASR (Automatic Speech Recognition) models. - - Features: - - Highly specialized for Speech-to-Text tasks. - - Aggregates all system text into a single cohesive system block. - - Drops user text entirely, extracting ONLY audio data into a unified user turn. - - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. - - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. - """ - - DEFAULT_SYSTEM_MESSAGE = """ - You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. - You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
- """ - - QWEN3_ASR_BOS_TOKEN = "<|im_start|>" - QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" - QWEN3_ASR_EOS_TOKEN = "<|im_end|>" - - - QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" - QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" - QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" - - CHAT_FORMAT = ( - "{%- set ns = namespace(system_text='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.role == 'system' -%}\n" - " {%- if m.content is string -%}\n" - " {%- set ns.system_text = ns.system_text + m.content -%}\n" - " {%- else -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'text' and (c.text is defined) -%}\n" - " {%- set ns.system_text = ns.system_text + c.text -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- set ns2 = namespace(audio_tokens='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.content is not string -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" - " {#- MTMD Audio Injection -#}\n" - " {%- set audio_val = '' -%}\n" - " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" - " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" - " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" - " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" - " {%- endif -%}\n" - " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" - "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif 
-%}\n" - ) - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token - kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") - - return super().__call__(**kwargs) - -class Qwen3VLChatHandler(MTMDChatHandler): - - QWEN3_VL_BOS_TOKEN = "<|endoftext|>" - QWEN3_VL_PAD_TOKEN = "<|endoftext|>" - QWEN3_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{{- '<|im_start|>system\n' -}}" - "{%- if messages[0].content is string and messages[0].role == 'system' -%}" - "{{- messages[0].content -}}" - "{%- elif messages[0].role == 'system' -%}" - "{%- if 'text' in messages[0].content -%}" - "{{- messages[0].content.text -}}" - "{%- else -%}" - "{{- 'You are a helpful assistant.' 
-}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{{- '\n\n' -}}" - "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" - "{%- for tool in tools -%}" - "{{- '\n' -}}" - "{{- tool | tojson -}}" - "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" - "{%- endif -%}" - "{{- '<|im_end|>\n' -}}" - "{%- set image_count = namespace(value=0) -%}" - #"{%- set video_count = namespace(value=0) -%}" - "{%- for message in messages -%}" - "{%- if message.role == 'tool' -%}" - "{{- '<|im_start|>user\n\n' -}}" - "{%- elif message.role != 'system' -%}" - "{{- '<|im_start|>' + message.role + '\n' -}}" - "{%- endif -%}" - "{%- if message.content is string and message.role != 'system' -%}" - "{{- message.content -}}" - "{%- elif message.role != 'system' -%}" - "{%- for content in message.content -%}" - "{%- if 'image_url' in content -%}" - "{%- set image_count.value = image_count.value + 1 -%}" - "{%- if add_vision_id -%}" - "{{- 'Picture ' -}}" - "{{- image_count.value | string -}}" - "{{- ': ' -}}" - "{%- endif -%}" - "{{- '<|vision_start|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|vision_end|>' -}}" - "{%- endif -%}" - # Video not supported yet - "{%- if 'text' in content -%}" - "{{- content.text -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message.role == 'assistant' -%}" - "{%- if message.tool_calls -%}" - "{%- for tool_call in message.tool_calls -%}" - "{%- if (loop.first and message.content) or (not loop.first) -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- if tool_call.function 
-%}" - "{%- set tool_call = tool_call.function -%}" - "{%- endif -%}" - "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" - "{%- if tool_call.arguments is string -%}" - "{{- tool_call.arguments -}}" - "{%- else -%}" - "{{- tool_call.arguments | tojson -}}" - "{%- endif -%}" - "{{- '}\n' -}}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- elif message.role == 'tool' -%}" - "{{- '' -}}" - "{%- endif -%}" - "{%- if message.role != 'system' -%}" - "{{- '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if force_reasoning -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - force_reasoning: bool = False, - add_vision_id: bool = True, - **kwargs, - ): - """ - Parameters: - - force_reasoning (bool): - - True: Force the reasoning in the model by adding to the chat template. - - False (default): Don't force the reasoning. - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - """ - super().__init__(**kwargs) - self.force_reasoning = force_reasoning - self.extra_template_arguments["force_reasoning"] = force_reasoning - self.extra_template_arguments["add_vision_id"] = add_vision_id - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen35ChatHandler(MTMDChatHandler): - """ - Handler for Qwen3.5/Qwen3.6 models. 
- """ - CHAT_FORMAT = ( - "{%- set image_count = namespace(value=0) -%}" - "{%- set video_count = namespace(value=0) -%}" - "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" - " {%- if content is string -%}" - " {{- content -}}" - " {%- elif content is iterable and content is not mapping -%}" - " {%- for item in content -%}" - " {%- if 'image_url' in item or item.type == 'image_url' -%}" - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain images.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set image_count.value = image_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Picture ' -}}" - " {{- image_count.value | string -}}" - " {{- ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'video' in item -%}" - " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain videos.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set video_count.value = video_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Video ' ~ video_count.value ~ ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {{- item.video -}}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'text' in item -%}" - " {{- item.text -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected item type in content.') -}}" - " {%- endif -%}" - " {%- endfor -%}" - " {%- elif content is none or content is undefined -%}" - " {{- '' -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected content type.') -}}" - " {%- endif -%}" - "{%- endmacro -%}" - "{%- if not messages -%}" - " {{- raise_exception('No messages provided.') 
-}}" - "{%- endif -%}" - "{%- if tools and tools is iterable and tools is not mapping -%}" - " {{- '<|im_start|>system\n' -}}" - " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" - " {%- for tool in tools -%}" - " {{- '\n' -}}" - " {{- tool | tojson -}}" - " {%- endfor -%}" - " {{- '\n' -}}" - " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" - " {%- if messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) | trim -%}" - " {%- if content -%}" - " {{- '\n\n' + content -}}" - " {%- endif -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - "{%- elif messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) -%}" - " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" - "{%- for message in messages[::-1] -%}" - " {%- set index = messages | length - 1 - loop.index0 -%}" - " {%- if ns.multi_step_tool and message.role == 'user' -%}" - " {%- set content = render_content(message.content, false) | trim -%}" - " {%- if not (content.startswith('') and content.endswith('')) -%}" - " {%- set ns.multi_step_tool = false -%}" - " {%- set ns.last_query_index = index -%}" - " {%- endif -%}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if ns.multi_step_tool -%}" - " {{- 
raise_exception('No user query found in messages.') -}}" - "{%- endif -%}" - "{%- for message in messages -%}" - " {%- set content = render_content(message.content, true) | trim -%}" - " {%- if message.role == 'system' -%}" - " {%- if not loop.first -%}" - " {{- raise_exception('System message must be at the beginning.') -}}" - " {%- endif -%}" - " {%- elif message.role == 'user' -%}" - " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" - " {%- elif message.role == 'assistant' -%}" - " {%- set reasoning_content = '' -%}" - " {%- if message.reasoning_content is string -%}" - " {%- set reasoning_content = message.reasoning_content -%}" - " {%- elif '' in content -%}" - " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" - " {%- set content = content.split('')[-1].lstrip('\n') -%}" - " {%- endif -%}" - " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" - " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" - " {%- else -%}" - " {{- '<|im_start|>' + message.role + '\n' + content -}}" - " {%- endif -%}" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_calls -%}" - " {%- if tool_call.function is defined -%}" - " {%- set tool_call = tool_call.function -%}" - " {%- endif -%}" - " {%- if loop.first -%}" - " {%- if content | trim -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- '\n\n\n' -}}" - " {%- endif -%}" - " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_call.arguments | items -%}" - " {{- '\n' -}}" - " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" - " {{- args_value -}}" - " {{- '\n' -}}" - 
" {%- endfor -%}" - " {%- endif -%}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif message.role == 'tool' -%}" - " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" - " {{- '<|im_start|>user' -}}" - " {%- endif -%}" - " {{- '\n\n' -}}" - " {{- content -}}" - " {{- '\n' -}}" - " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif loop.last -%}" - " {{- '<|im_end|>\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- raise_exception('Unexpected message role.') -}}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is defined and enable_thinking is false -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n' -}}" - " {%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - add_vision_id: bool = True, - enable_thinking: bool = True, - preserve_thinking: bool = False, - **kwargs, - ): - """ - Parameters: - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - - preserve_thinking (bool): - - True: Keeps reasoning process for ALL historical conversational turns. - - False (default): Only keeps for the latest assistant reply to save tokens. 
- """ - super().__init__(**kwargs) - self.enable_thinking = enable_thinking - self.preserve_thinking = preserve_thinking - self.extra_template_arguments["add_vision_id"] = add_vision_id - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["preserve_thinking"] = preserve_thinking - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Step3VLChatHandler(MTMDChatHandler): - """ - Handler for Step3-VL models. - """ - - STEP3VL_BOS_TOKEN = "<|im_start|>" - STEP3VL_EOS_TOKEN = "<|im_end|>" - STEP3VL_PAD_TOKEN = "<|endoftext|>" - STEP3VL_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro render_content(content) -%}\n" - " {%- if content is none -%}{{- '' -}}\n" - " {%- elif content is string -%}{{- content -}}\n" - " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" - " {%- elif content is iterable -%}\n" - " {%- for item in content -%}\n" - " {%- if item.type == 'text' -%}\n" - " {{- item['value'] if 'value' in item else item['text'] -}}\n" - " {%- elif item.type in ['image', 'image_url'] -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.image_url -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {{- '' + url_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "\n" - "{%- if tools -%}\n" - " {{- '<|im_start|>system\\n' -}}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" - " {%- endif -%}\n" - " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" - " {%- for tool in tools -%}\n" - " {{- '\\n' -}}\n" - " {{- tool | tojson -}}\n" - " {%- endfor -%}\n" - " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" - "{%- else -%}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - "\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" - "{%- for message in messages[::-1] -%}\n" - " {%- set index = (messages|length - 1) - loop.index0 -%}\n" - " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" - " {%- set ns.multi_step_tool = false -%}\n" - " {%- set ns.last_query_index = index -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- for message in messages -%}\n" - " {%- set content = render_content(message.content) -%}\n" - " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" - " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" - " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" - " {%- elif message.role == 'assistant' -%}\n" - " {%- if message.reasoning_content is string -%}\n" - " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" - " {%- else -%}\n" - " {%- if '' in content -%}\n" - 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" - " {%- else -%}\n" - " {%- set reasoning_content = '' -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if loop.index0 > ns.last_query_index -%}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" - " {%- else -%}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" - " {%- endif -%}\n" - " {%- if message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- for tool_call in message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- if tool_call.function -%}\n" - " {%- set tool_call = tool_call.function -%}\n" - " {%- endif -%}\n" - " {{- '\\n{\"name\": \"' -}}\n" - " {{- tool_call.name -}}\n" - " {{- '\", \"arguments\": ' -}}\n" - " {%- if tool_call.arguments is string -%}\n" - " {{- tool_call.arguments -}}\n" - " {%- else -%}\n" - " {{- tool_call.arguments | tojson -}}\n" - " {%- endif -%}\n" - " {{- '}\\n' -}}\n" - " {%- endfor -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- elif message.role == 'tool' -%}\n" - " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" - " {{- '<|im_start|>tool_response' -}}\n" - " {%- endif -%}\n" - " {{- '\\n\\n' -}}\n" - " {{- content -}}\n" - " {{- '\\n' -}}\n" - " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Step3-VL Handler. - - Args: - enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Pass thinking toggle into Jinja - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Step3 uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -@register_chat_completion_handler("chatml-function-calling") -def chatml_function_calling( - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logprobs: 
Optional[bool] = None, - top_logprobs: Optional[int] = None, - **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: - function_calling_template = ( - "{% for message in messages %}" - "<|im_start|>{{ message.role }}\n" - # System message - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% if tool_calls %}" - "\n\nYou have access to the following functions:\n" - "{% for tool in tools %}" - "\nfunctions.{{ tool.function.name }}:\n" - "{{ tool.function.parameters | tojson }}" - "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" - "\n\nmessage:" - "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "{% endif %}" - "<|im_end|>\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - ## Reglar message - "{% if message.content and message.content | length > 0 %}" - "{% if tool_calls %}" - "message:\n" - "{% endif %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - ## Function calls - "{% if 'tool_calls' in message %}" - "{% for tool_call in message.tool_calls %}" - "functions.{{ tool_call.function.name }}:\n" - "{{ tool_call.function.arguments }}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - ) - template_renderer = ImmutableSandboxedEnvironment( - autoescape=jinja2.select_autoescape(["html", "xml"]), - undefined=jinja2.StrictUndefined, - 
).from_string(function_calling_template) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - stop = ( - [stop, "<|im_end|>"] - if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] - ) - - # Case 1: No tool choice by user - if ( - tool_choice is None - or (isinstance(tool_choice, str) and tool_choice == "none") - or tools is None - or len(tools) == 0 - ): - prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, - ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - 
grammar=grammar, - logprobs=top_logprobs if logprobs else None, - ), - stream=stream, - ) - - # Case 2: Tool choice by user - if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None - ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") - prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, - ) - prompt += f"functions.{tool_name}:\n" + prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( json.dumps(tool["function"]["parameters"]), verbose=llama.verbose @@ -6956,3 +3539,35 @@ def chatml_function_calling( } raise ValueError("Automatic streaming tool choice is not supported") + +# Backward compatibility re-exports. +# These multimodal chat handlers have been moved to `llama_multimodal`. +# New code should import them from `llama_cpp.llama_multimodal` instead of +# `llama_cpp.llama_chat_format`. 
+from llama_cpp.llama_multimodal import ( + MTMDChatHandler, + GenericMTMDChatHandler, + Llava15ChatHandler, + ObsidianChatHandler, + MoondreamChatHandler, + Llava16ChatHandler, + NanoLlavaChatHandler, + Llama3VisionAlphaChatHandler, + Llama3VisionAlpha, + MiniCPMv26ChatHandler, + MiniCPMv45ChatHandler, + MiniCPMV46ChatHandler, + Gemma3ChatHandler, + Gemma4ChatHandler, + GLM41VChatHandler, + GLM46VChatHandler, + GraniteDoclingChatHandler, + LFM2VLChatHandler, + LFM25VLChatHandler, + PaddleOCRChatHandler, + Qwen25VLChatHandler, + Qwen3ASRChatHandler, + Qwen3VLChatHandler, + Qwen35ChatHandler, + Step3VLChatHandler +) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py new file mode 100644 index 0000000000..a055869543 --- /dev/null +++ b/llama_cpp/llama_multimodal.py @@ -0,0 +1,3473 @@ +from __future__ import annotations + +import base64 +import ctypes +import json +import os +import sys +import zlib + +from contextlib import ExitStack +from typing import ( + Any, + Dict, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, + Protocol, + TYPE_CHECKING, + cast, +) + +import urllib.request +from urllib.error import URLError, HTTPError + +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar + +if TYPE_CHECKING: + import llama_cpp.llama as llama_core + +from ._logger import ggml_log_callback + +from llama_cpp.llama_chat_format import ( + _convert_completion_to_chat, + _convert_completion_to_chat_function, + _grammar_for_response_format, + ImmutableSandboxedEnvironment +) + +class MTMDChatHandler: + DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always 
responding in the same language as the user's question, remaining polite, professional, and maximally helpful." + ) + + CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% elif message.role == 'user' %}" + "USER: " + "{% if message.content is string %}" + "{{ message.content }}" + "{% elif message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" + "{% elif content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" + "{% endif %}" + "{{ \"\n\" }}" + "{% endfor %}" + + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + + "{% if add_generation_prompt %}" + "ASSISTANT: " + "{% endif %}" + ) + + def __init__( + self, + mmproj_path: Optional[str] = None, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, + batch_max_tokens: int = 1024, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + self.verbose = verbose + + # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`. 
+ # Accept it for existing user code, warn during initialization, and normalize + # all internal usage to `mmproj_path`. + clip_model_path = kwargs.pop("clip_model_path", None) + if mmproj_path is None and clip_model_path is not None: + mmproj_path = clip_model_path + if self.verbose: + print( + f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; " + "please use `mmproj_path` instead.", + file=sys.stderr, + ) + + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." + ) + + if mmproj_path is None: + raise ValueError( + f"{self.log_prefix}(__init__): `mmproj_path` is required. " + "`clip_model_path` is accepted only as a deprecated compatibility alias." + ) + + self.mmproj_path = mmproj_path + if not os.path.exists(self.mmproj_path): + raise ValueError( + f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}" + ) + + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens + self.use_gpu = use_gpu + + import llama_cpp.mtmd_cpp as mtmd_cpp + self._mtmd_cpp = mtmd_cpp + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} + + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + + # Pre-compile Jinja template + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: + self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override + + self._chat_format_parser_tags = [] + self._change_chat_template(self.chat_format) + + self._exit_stack = ExitStack() + + def _change_chat_template(self, new_template: str): + 
self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) + + def _init_mtmd_context(self, llama_model: llama_core.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.mmproj_path.encode(), + llama_model.model, + self.mctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") + + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT 
def _get_media_items(self, messages: "List[llama_types.ChatCompletionRequestMessage]") -> List[Dict[str, str]]:
    """
    Collect every media payload referenced by `messages`, in order of appearance.

    Only messages whose `content` is a list are inspected. Each content part is
    dispatched on its `type`:

    - `image_url`                           -> {"url": ..., "type": "image"}
    - `audio` / `audio_url` / `input_audio` -> {"url": ..., "type": "audio"}
    - `video_url`                           -> {"url": ..., "type": "video"}
    - `text`                                -> skipped
    - anything else                         -> skipped (logged when `self.verbose`)

    Args:
        messages: Chat messages (mappings with an optional `content` entry).

    Returns:
        A list of `{"url": str, "type": "image" | "audio" | "video"}` dictionaries.

    Raises:
        ValueError: If a media type is not supported by this handler
            (`is_support_vision` / `is_support_audio` / `is_support_video`),
            if `input_audio.format` is neither 'wav' nor 'mp3', or if a media
            part carries no usable URL payload.
    """

    def _resolve_url(part: dict, key: str) -> str:
        """Return the URL stored under `part[key]`: a bare string or a mapping with a 'url' entry."""
        payload = part.get(key)
        if isinstance(payload, str):
            return payload
        try:
            return payload["url"]
        except (KeyError, TypeError, IndexError):
            # A missing key / wrong payload shape is a malformed request, so it is
            # reported the same way as the other validation errors in this method.
            raise ValueError(
                f"{self.log_prefix}: '{key}' content part must be a string or a mapping with a 'url' entry."
            ) from None

    media_items: List[Dict[str, str]] = []

    for message in messages:
        parts = message.get("content")
        if not isinstance(parts, list):
            continue

        for part in parts:
            # Tolerate stray non-mapping entries instead of failing on `.get`.
            if not isinstance(part, dict):
                if self.verbose:
                    print(
                        f"{self.log_prefix}: Ignored non-mapping content part of type '{type(part).__name__}'.",
                        file=sys.stderr,
                    )
                continue

            content_type = part.get("type", "")

            # 1. Vision Processing
            if content_type == "image_url":
                if not self.is_support_vision:
                    raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.")
                media_items.append({"url": _resolve_url(part, "image_url"), "type": "image"})

            # 2. Audio Processing
            elif content_type in ("audio", "audio_url", "input_audio"):
                if not self.is_support_audio:
                    raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")

                if content_type == "input_audio":
                    # OpenAI-style payload: {"data": <base64>, "format": "wav" | "mp3"}
                    input_audio = part.get("input_audio", {})
                    if isinstance(input_audio, dict) and "data" in input_audio:
                        audio_data = input_audio.get("data", "")
                        audio_format = input_audio.get("format", "")
                        if audio_format not in ("wav", "mp3"):
                            raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'")
                        # Re-packaged as a data URI so it goes through the same loader as URLs.
                        media_items.append({
                            "url": f"data:audio/{audio_format};base64,{audio_data}",
                            "type": "audio",
                        })
                    elif isinstance(input_audio, str) and input_audio:
                        # A bare string payload is forwarded unchanged.
                        media_items.append({"url": input_audio, "type": "audio"})
                    elif self.verbose:
                        # Still skipped (as before), but no longer silently.
                        print(
                            f"{self.log_prefix}: Ignored 'input_audio' content part without a usable payload.",
                            file=sys.stderr,
                        )
                else:
                    # `audio` / `audio_url`: payload is stored under the key named after the type.
                    media_items.append({"url": _resolve_url(part, content_type), "type": "audio"})

            # 3. Video Processing
            elif content_type == "video_url":
                if not self.is_support_video:
                    raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.")
                media_items.append({"url": _resolve_url(part, "video_url"), "type": "video"})

            # 4. Text & Unknown Types
            elif content_type == "text":
                continue
            elif self.verbose:
                print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr)

    return media_items
def _process_mtmd_prompt(
    self,
    llama: "llama_core.Llama",
    messages: "List[llama_types.ChatCompletionRequestMessage]",
    functions: "Optional[List[llama_types.ChatCompletionFunction]]" = None,
    function_call: "Optional[llama_types.ChatCompletionRequestFunctionCall]" = None,
    tools: "Optional[List[llama_types.ChatCompletionTool]]" = None,
    tool_choice: "Optional[llama_types.ChatCompletionToolChoiceOption]" = None,
    add_generation_prompt: bool = True,
) -> Tuple[List[int], List[tuple], Any, List[Any]]:
    """
    Render the chat prompt, decode its media, and tokenize both into MTMD chunks.

    Steps:
      1. Prepend `DEFAULT_SYSTEM_MESSAGE` when no non-empty system message is present.
      2. Render `self.chat_template`, then replace known media tags and every
         media URL in the rendered text with `self.media_marker`.
      3. Decode all media items on a thread pool; results are stored by index so
         the bitmap order matches the order of `media_items`.
      4. Call `mtmd_tokenize` on the text plus bitmaps.
      5. Walk the resulting chunks and build a flat id list in which text chunks
         contribute their token ids and media chunks contribute a negative
         pseudo-id (derived from a CRC32 of the media URL) repeated `n_tokens` times.

    Returns:
        full_prompt_ids: Text token ids interleaved with negative media pseudo-ids.
        chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id).
        chunks: The `mtmd_input_chunks` handle; on success the caller owns it and must free it.
        bitmap_cleanup: Bitmap handles; on success the caller owns them and must free them.

    Raises:
        ValueError: If chunk allocation or tokenization fails.
        RuntimeError: If a media slot is still empty after decoding.
        TypeError: If a chunk reports a type that is neither text, image nor audio.
        Any exception raised while loading or decoding a media item is re-raised
        after every native handle created so far has been freed.
    """
    # 1. Inject default system prompt if omitted by the user
    system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "")
    if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
        messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages

    media_items = self._get_media_items(messages)
    media_marker = self.media_marker

    # 2. Render the chat template and replace media references with the media marker
    text = self.chat_template.render(
        messages=messages,
        add_generation_prompt=add_generation_prompt,
        eos_token=self.mtmd_eos_token,
        bos_token=self.mtmd_bos_token,
        functions=functions,
        function_call=function_call,
        tools=tools,
        tool_choice=tool_choice,
        **getattr(self, 'extra_template_arguments', {})
    )

    # Model-specific placeholder tags emitted by the template itself.
    for tag in self._chat_format_parser_tags:
        text = text.replace(tag, media_marker)

    # URLs / data URIs rendered verbatim by the template.
    for item in media_items:
        # str.replace("", x) would insert the marker between every character.
        if item["url"]:
            text = text.replace(item["url"], media_marker)

    if self.verbose:
        print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr)
        print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr)

    # 3. Pre-allocate the bitmap slots so results can be stored by index
    bitmaps: List[Any] = [None] * len(media_items)
    bitmap_cleanup: List[Any] = []
    video_cleanup: List[Any] = []
    chunks = None

    try:
        if media_items:
            import concurrent.futures

            def _decode_media(item: dict):
                media_bytes = self.load_media(item["url"], item["type"])
                return self._create_bitmap_from_bytes(media_bytes)

            # ThreadPoolExecutor rejects max_workers <= 0.
            max_workers = max(1, min(int(llama.n_threads or 1), len(media_items)))
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(_decode_media, item) for item in media_items]

            # Leaving the `with` block has waited for every worker. Harvest ALL
            # results before reporting a failure, so that handles produced by
            # the successful workers are registered for cleanup and not leaked.
            first_error = None
            for idx, future in enumerate(futures):
                try:
                    bitmap, video_ctx = future.result()
                except Exception as exc:
                    if first_error is None:
                        first_error = exc
                    continue

                bitmaps[idx] = bitmap
                bitmap_cleanup.append(bitmap)
                if video_ctx:
                    video_cleanup.append(video_ctx)

            if first_error is not None:
                raise first_error

            # Strict validation: abort if any slot is still empty
            if any(b is None for b in bitmaps):
                raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.")
            if self.verbose:
                print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.", file=sys.stderr)

        # 4. Initialize mtmd_input_chunks
        input_text = self._mtmd_cpp.mtmd_input_text()
        input_text.text = text.encode('utf-8')
        input_text.add_special = (llama.n_tokens == 0)
        input_text.parse_special = True

        chunks = self._mtmd_cpp.mtmd_input_chunks_init()
        # A NULL ctypes pointer is falsy but is not necessarily `None`.
        if not chunks:
            chunks = None
            raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.")

        # 5. Hybrid Tokenization (Text + Media binding)
        if len(bitmaps) > 0:
            bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
            result = self._mtmd_cpp.mtmd_tokenize(
                self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps)
            )
        else:
            result = self._mtmd_cpp.mtmd_tokenize(
                self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0
            )

        if result != 0:
            raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.")

        # Video helper contexts only need to stay alive until mtmd_tokenize() completes.
        for video_ctx in video_cleanup:
            self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
        video_cleanup.clear()

        # 6. Virtual Token Ledger Construction
        full_prompt_ids: List[int] = []
        chunk_token_spans: List[tuple] = []
        current_idx = 0
        n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)

        # Cursor over the user-supplied media items, advanced once per media chunk
        media_items_count = len(media_items)
        media_items_cur = 0
        last_media_id = None

        for i in range(n_chunks):
            chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
            if chunk is None:
                continue
            chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)

            if self._is_text_chunk(chunk_type):
                # Copy the text token ids out of the native buffer
                n_tokens_out = ctypes.c_size_t()
                tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out))
                if tokens_ptr and n_tokens_out.value > 0:
                    tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
                    chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None))
                    full_prompt_ids.extend(tokens)
                    current_idx += len(tokens)

            elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
                # The ledger is sized with `n_tokens` (not `n_pos`), as in the
                # original implementation.
                chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)

                if media_items_cur < media_items_count:
                    # Every placeholder looks identical to the tokenizer, so the
                    # identity of the media is injected here: a deterministic
                    # negative id in [-16_777_315, -100] derived from the URL.
                    # Negative values cannot collide with real (non-negative) token ids.
                    real_media_url = media_items[media_items_cur]["url"]
                    media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100
                    last_media_id = media_id
                    media_items_cur += 1
                elif last_media_id is not None:
                    # More media chunks than media items: reuse the previous id
                    # (e.g. one video marker expanding into several image chunks).
                    media_id = last_media_id
                else:
                    # Fallback id when no media item is available at all
                    media_id = -314159

                if self.verbose:
                    print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ", file=sys.stderr)

                chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))

                # Pad the ledger with the pseudo-id, one entry per media token
                full_prompt_ids.extend([media_id] * chunk_n_tokens)
                current_idx += chunk_n_tokens
            else:
                raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.")

        return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup

    except BaseException:
        # On any failure (including interrupts) free every native handle created
        # so far, then re-raise with the original traceback.
        if chunks:
            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
        for bitmap in bitmap_cleanup:
            self._mtmd_cpp.mtmd_bitmap_free(bitmap)
        for video_ctx in video_cleanup:
            self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
        raise
Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], + ]: + # 1. Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None + + # 2. Concurrent Preprocessing & Ledger Construction + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( + llama=llama, + messages=messages, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + add_generation_prompt=add_generation_prompt, + ) + + if self.verbose: + print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) + + try: + # 3. KV Cache Synchronization & State Rollback + # Compares the virtual ledger with physical history to prevent Cache Poisoning. + current_history = llama.input_ids[:llama.n_tokens].tolist() + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) + + if longest_prefix < llama.n_tokens: + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if llama._hybrid_cache_mgr.max_checkpoints > 0: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. 
Clearing hybrid cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) + llama._ctx.memory_seq_rm(0, longest_prefix, -1) + llama.n_tokens = longest_prefix + + n_past = llama.n_tokens + + for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: + # Skip previously matched chunks + if end_idx <= n_past: + continue + + if self._is_text_chunk(chunk_type): + unprocessed_start = max(start_idx, n_past) - start_idx + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) + + if tokens_ptr and n_tokens_out.value > 0: + all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + tokens_to_eval = all_tokens[unprocessed_start:] + + if tokens_to_eval: + if self.verbose: + print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + # Text evaluation delegates shift and chunking to native llama.eval + llama.eval(tokens_to_eval) + n_past = llama.n_tokens + + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) + + if self.verbose: + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" + print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + + # Stage 5: Multimodal Physical OOM Defense + if n_past + chunk_n_tokens > llama.n_ctx(): + if not 
llama._ctx.memory_can_shift(): + raise RuntimeError( + f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " + f"You MUST increase n_ctx to fit the dialogue." + ) + else: + # Safely discard oldest tokens while preserving system prompts + n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch + n_keep = min(llama.n_keep, n_past) + n_discard = min(n_discard, n_past - n_keep) + + if n_discard <= 0: + raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") + + if self.verbose: + print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) + + # Execute physical memory shift + llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) + llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) + + # Shift python virtual array to match + remaining_len = n_past - (n_keep + n_discard) + if remaining_len > 0: + llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] + + n_past -= n_discard + llama.n_tokens = n_past + + # Execute C++ Multimodal Black-box Extraction + new_n_past = llama_cpp_lib.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk_ptr, + llama_cpp_lib.llama_pos(n_past), + llama_cpp_lib.llama_seq_id(0), + llama.n_batch, + True, # logits_last = True, drastically saves computational overhead + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") + + # Update Ledger with "Negative Reverse Vocabulary" IDs + llama.input_ids[n_past : new_n_past.value] = media_id + n_past = new_n_past.value + llama.n_tokens = 
n_past + + # Extract the final, perfectly synchronized prompt sequence + prompt = llama.input_ids[: llama.n_tokens].tolist() + + # End-of-Turn Checkpoint + # Anchors the state ONLY after the entire multi-modal turn is processed + if ( + llama.is_hybrid + and llama._hybrid_cache_mgr is not None + and llama._hybrid_cache_mgr.max_checkpoints > 0 + ): + if self.verbose: + print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) + + llama._hybrid_cache_mgr.save_checkpoint( + current_pos=llama.n_tokens, + tokens=prompt, + seq_id=0 + ) + finally: + # Cleanup chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Cleanup bitmaps + if bitmap_cleanup: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup.clear() + bitmap_array = None + + # Handle response format and tools (same as before) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if ( + tool_choice is not None + and isinstance(tool_choice, dict) + and tools is not None + ): + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create 
grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + if llama.verbose: + print(str(e), file=sys.stderr) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, + stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, + ) + + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + def load_media(self, media_url: str, media_type: str) -> bytes: + """ + Unified dispatcher for loading media payloads. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
@staticmethod
def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes:
    """
    Load raw bytes from a data URI, a local file path, or a remote URL.

    The source is tried in this order:
      1. `data:` URI   - everything after the first comma is base64-decoded.
      2. Existing path - the file is read in binary mode.
      3. Anything else - fetched with `urllib.request.urlopen`.

    Args:
        media_url: Data URI, file path or URL.
        timeout: Timeout in seconds passed to `urlopen` for remote sources.
        kind: Human-readable label ("image", "audio", ...) used in error messages.

    Returns:
        The non-empty payload.

    Raises:
        ValueError: Malformed data URI, undecodable base64, a string that is
            neither a data URI, an existing path nor a valid URL, or an empty payload.
        ConnectionError: The remote download failed (including timeouts).

    NOTE(security): `media_url` typically originates from a chat request. This
    function reads arbitrary local paths and fetches arbitrary URLs by design;
    callers exposing it to untrusted clients should restrict sources upstream.
    """
    media_bytes = b""

    # 1. Handle data URI
    if media_url.strip().startswith("data:"):
        _header, separator, base64_data = media_url.partition(",")
        if not separator:
            raise ValueError("Invalid data URI: missing comma separator")
        try:
            media_bytes = base64.b64decode(base64_data)
        except ValueError as e:
            # binascii.Error is a ValueError subclass; re-raise with context.
            raise ValueError(f"Invalid base64 {kind} data in data URI: {e}") from e

    # 2. Handle local file path
    elif os.path.exists(media_url):
        with open(media_url, "rb") as f:
            media_bytes = f.read()

    # 3. Handle remote URL
    else:
        try:
            req = urllib.request.Request(
                media_url,
                headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS,
            )
        except ValueError as e:
            # Request() rejects strings without a URL scheme ("unknown url type").
            shown = media_url if len(media_url) <= 120 else media_url[:117] + "..."
            raise ValueError(
                f"Cannot load {kind}: '{shown}' is not a data URI, an existing file, or a valid URL"
            ) from e
        try:
            with urllib.request.urlopen(req, timeout=timeout) as f:
                media_bytes = f.read()
        except OSError as e:
            # URLError / HTTPError and socket timeouts are all OSError subclasses,
            # so a timeout while reading the body is reported like any other failure.
            raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") from e

    if not media_bytes:
        raise ValueError(f"Empty {kind} data received")

    return media_bytes
@classmethod
def from_pretrained(
    cls,
    repo_id: str,
    filename: Optional[str],
    local_dir: Optional[Union[str, os.PathLike[str]]] = None,
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
    **kwargs: Any,
) -> "MTMDChatHandler":
    """
    Download a projector file from the Hugging Face Hub and build a handler from it.

    The entries returned by `HfFileSystem().ls(repo_id)` are matched against
    `filename` with `fnmatch`; exactly one entry must match.

    Args:
        repo_id: Hub repository id.
        filename: File name or glob pattern selecting the file to download.
        local_dir: Optional download directory, forwarded to `hf_hub_download`.
        local_dir_use_symlinks: Forwarded to `hf_hub_download`.
        cache_dir: Optional cache directory, forwarded to `hf_hub_download`.
        **kwargs: Forwarded to the class constructor.

    Returns:
        An instance of `cls` constructed with `mmproj_path` set to the downloaded file.

    Raises:
        ImportError: If `huggingface_hub` is not installed.
        ValueError: If `filename` is None, or if zero or several files match it.
    """
    import fnmatch
    from pathlib import Path

    try:
        from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
        from huggingface_hub.utils import validate_repo_id  # type: ignore
    except ImportError as e:
        raise ImportError(
            f"{cls.__name__}.from_pretrained requires the huggingface_hub package. "
            "You can install it with `pip install --upgrade huggingface_hub`."
        ) from e

    if filename is None:
        # fnmatch would otherwise fail with an unrelated TypeError.
        raise ValueError("`filename` must be a file name or glob pattern, not None.")

    validate_repo_id(repo_id)

    hffs = HfFileSystem()

    files = [
        file["name"] if isinstance(file, dict) else file
        for file in hffs.ls(repo_id)  # type: ignore
    ]

    # Strip the leading repo id so patterns are matched against repo-relative paths.
    file_list: List[str] = [Path(file).relative_to(repo_id).as_posix() for file in files]

    matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]

    if len(matching_files) == 0:
        raise ValueError(
            f"No file found in {repo_id} that match {filename}\n\n"
            f"Available Files:\n{json.dumps(file_list)}"
        )

    if len(matching_files) > 1:
        raise ValueError(
            f"Multiple files found in {repo_id} matching {filename}\n\n"
            f"Matching Files:\n{json.dumps(matching_files)}"
        )

    (matching_file,) = matching_files

    matched = Path(matching_file)
    parent = matched.parent.as_posix()
    # Path("name").parent is "."; pass no subfolder in that case.
    subfolder = None if parent == "." else parent

    # hf_hub_download returns the local path of the file for both the cache
    # and the `local_dir` layouts, so the path is taken from it directly
    # instead of being rebuilt by hand (which dropped the subfolder).
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=matched.name,
        subfolder=subfolder,
        local_dir=cast(Union[str, Path, None], local_dir),
        local_dir_use_symlinks=local_dir_use_symlinks,
        cache_dir=cast(Union[str, Path, None], cache_dir),
    )

    return cls(
        mmproj_path=model_path,
        **kwargs,
    )
class GenericMTMDChatHandler(MTMDChatHandler):
    """
    Experimental multimodal chat handler driven by a caller-supplied Jinja template.

    Instead of a hard-coded `CHAT_FORMAT`, the template text is passed in as
    `chat_format` (for example the template stored in the model metadata).
    Before each call, the placeholders from `KNOWN_MEDIA_TAGS` that occur in
    the template are published as `self._chat_format_parser_tags`.
    """

    # Media placeholder tags looked for in the template text.
    KNOWN_MEDIA_TAGS = [
        "<|image_pad|>",
        "<|audio_pad|>",
        "<|video_pad|>",
        "<|image|>",
        "<|audio|>",
        "<|video|>",
        "[IMG]"
    ]

    def __init__(
        self,
        chat_format: str,
        mmproj_path: str,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Args:
            chat_format: Jinja chat template text.
            mmproj_path: Path to the multimodal projector file, forwarded to the parent.
            verbose: Enables diagnostic output on stderr.
            **kwargs: Forwarded to the parent constructor.

        Raises:
            ValueError: If `chat_format` is None, not a string, or blank.
        """
        if chat_format is None:
            raise ValueError("Failed to get model chat template automatically.")
        if not isinstance(chat_format, str) or not chat_format.strip():
            # Fail early with a clear message rather than inside template handling.
            raise ValueError("`chat_format` must be a non-empty Jinja template string.")

        # Set before super().__init__() so the attributes exist while the parent
        # constructor runs.
        self.chat_format = chat_format
        self.verbose = verbose
        if self.verbose:
            print(
                f"Got chat template from model:\n```jinja\n{self.chat_format}\n```",
                file=sys.stderr,
                flush=True,
            )

        super().__init__(mmproj_path=mmproj_path, verbose=verbose, **kwargs)

    def __call__(self, **kwargs):
        """Publish the media tags present in the template, then delegate to the parent `__call__`."""
        self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]

        if self.verbose:
            print(f"{self.log_prefix} - Start processing", file=sys.stderr)

        # Use parent implementation
        return super().__call__(**kwargs)
However, with ### as the seperator + + # <|im_start|>user + # What is this sign about?\n + # ### + # <|im_start|>assistant + # The sign is about bullying, and it is placed on a black background with a red background. + # ### + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 'system' %}" + "<|im_start|>system\n" + "{{ message.content }}\n" + "###\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "###\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "###\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class MoondreamChatHandler(MTMDChatHandler): + # Chat Format: + # f"\n\n{chat_history}Question: {question}\n\nAnswer:" + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + 
"Question: {{ content.text }}\n\n" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "Question: {{ message.content }}\n\n" + "{% endif %}" + "{% endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "Answer:{{ message.content }}\n\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + + +class Llava16ChatHandler(MTMDChatHandler): + # Example prompt + # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "{{ message.content }}" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + + +class NanoLlavaChatHandler(MTMDChatHandler): + # Prompt Format + # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: + + # <|im_start|>system + # Answer the question<|im_end|><|im_start|>user + # + # What is the picture about?<|im_end|><|im_start|>assistant + DEFAULT_SYSTEM_MESSAGE = "Answer the question" + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 
'system' %}" + "<|im_start|>system\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class Llama3VisionAlphaChatHandler(MTMDChatHandler): + # question = "" + q + + # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + + CHAT_FORMAT = ( + "{% for message in messages %}" + "<|start_header_id|>" + "{% if message.role == 'user' %}" + "user<|end_header_id|>\n\n" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% 
endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "assistant<|end_header_id|>\n\n" + "{{ message.content }}" + "{% endif %}" + "<|eot_id|>" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + "{% endif %}" + ) + + +# alias +Llama3VisionAlpha = Llama3VisionAlphaChatHandler + + +class MiniCPMv26ChatHandler(MTMDChatHandler): + + CHAT_FORMAT = ( + "{% set image_count = namespace(value=0) %}" + "{% for message in messages %}" + "{% if loop.first and messages[0]['role'] != 'system' %}" + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "{% endif %}" + "<|im_start|>{{ message['role'] }}\n" + "{% if message['content'] is iterable %}" + "{% for content in message['content'] %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{% set image_count.value = image_count.value + 1 %}" + "{{ image_count.value }}: {{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{% set image_count.value = image_count.value + 1 %}" + "{{ image_count.value }}: {{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + + "{% for content in message['content'] %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% endif %}" + "<|im_end|>\n" + "{% endfor %}" + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class MiniCPMv45ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V 4.5 models. + + Supports: + - Multi-step tool calls with and XML tags. + - Integrated reasoning (thinking) process with tags. + - Specialized system prompt handling with tool definitions. + - Global image numbering for multi-image processing. 
+ """ + + # Model specific control tokens + MINICPMV_BOS_TOKEN = "<|im_start|>" + MINICPMV_EOS_TOKEN = "<|im_end|>" + MINICPMV_PAD_TOKEN = "<|endoftext|>" + + # Image placeholder tags + MINICPMV_IMAGE_START_TOKEN = "" + MINICPMV_IMAGE_END_TOKEN = "" + MINICPMV_IMAGE_ID_START_TOKEN = "" + MINICPMV_IMAGE_ID_END_TOKEN = "" + + CHAT_FORMAT = ( + # --- 1. First System Message & Tools Definitions --- + "{%- if tools %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" + "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" + "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" + "{{- 'You are provided with function signatures within XML tags:\\n' }}" + "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" + "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- elif messages[0].role == 'system' %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + + # --- 2. 
Message Stream Processing --- + "{% set image_count = namespace(value=0) %}" + "{%- for message in messages %}" + # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- + "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" + "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" + + "{%- set content = message.content %}" + "{%- if content is not string %}" + "{%- set ns = namespace(content_str='') %}" + "{%- for item in content %}" + # --- Explicit image_url type and value checking --- + "{%- if item.type == 'image_url' %}" + "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" + "{%- set image_count.value = image_count.value + 1 %}" + # Format: N: IMAGE_URL + "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" + "{%- elif item.type == 'text' %}" + "{%- set ns.content_str = ns.content_str + item.text %}" + "{%- endif %}" + "{%- endfor %}" + "{%- set content = ns.content_str %}" + "{%- endif %}" + + "{{- content -}}" + + # Append tool_calls to assistant messages if they exist + "{%- if message.role == 'assistant' and message.tool_calls %}" + "{%- for tool_call in message.tool_calls %}" + "{%- set tc = tool_call.function if tool_call.function else tool_call %}" + "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" + "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" + "{{- '}\\n' }}" + "{%- endfor %}" + "{%- endif %}" + "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" + + # --- Specialized Tool Response Handling --- + # Group consecutive tool responses under a single user-like block + "{%- elif message.role == 'tool' %}" + "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" + "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" + "{%- endif %}" + "{{- '\\n\\n' + message.content + '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" + "{{- '" + 
MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + + # --- 3. Generation Prompt --- + "{%- if add_generation_prompt %}" + "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" + # Handle thinking/reasoning block visibility based on configuration + "{%- if enable_thinking is defined and enable_thinking is false %}" + "{{- '\\n\\n\\n\\n' }}" + "{%- elif enable_thinking is defined and enable_thinking is true %}" + "{{- '\\n' }}" + "{%- endif %}" + "{%- endif %}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V 4.5 Handler. + + Args: + enable_thinking (bool): If True, model generates reasoning before the final answer. + **kwargs: Additional arguments for the base MTMDChatHandler. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject thinking control flag into the template + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set stop token patch + kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + return super().__call__(**kwargs) + + +class MiniCPMV46ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V-4.6 models. + + Features: + - Aligned with official tokenizer_config.json special tokens. + - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. + - Integrated MTMD-style URL and Base64 injection for visual content. + - Specialized `` and `` block generation. + - Autonomously folds previous reasoning paths using `last_query_index`. + - Toggles `` block generation via `enable_thinking` (Defaults to False). 
+ """ + + # Core tokens + MINICPM_BOS_TOKEN = "<|im_start|>" + MINICPM_EOS_TOKEN = "<|im_end|>" + MINICPM_PAD_TOKEN = "<|endoftext|>" + + # Vision tokens + MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" + MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" + MINICPM_IMAGE_TOKEN = "<|image_pad|>" + MINICPM_VIDEO_TOKEN = "<|video_pad|>" + + CHAT_FORMAT = ( + "{%- if enable_thinking is not defined -%}\n" + " {%- set enable_thinking = false -%}\n" + "{%- endif -%}\n" + "{%- macro render_content(content, is_system_content=false) -%}\n" + " {%- if content is string -%}\n" + " {{- content -}}\n" + " {%- elif content is iterable and content is not mapping -%}\n" + " {%- set ns = namespace(parts=[]) -%}\n" + " {%- for item in content -%}\n" + " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" + " {%- if is_system_content -%}\n" + " {{- raise_exception('System message cannot contain images.') -}}\n" + " {%- endif -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.type == 'image_url' -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" + # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" + # " {%- if is_system_content -%}\n" + # " {{- raise_exception('System message cannot contain videos.') -}}\n" + # " {%- endif -%}\n" + # " {%- set url_val = '' -%}\n" + # " {%- if item.type == 'video_url' -%}\n" + # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" + # " {%- endif -%}\n" + # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" + " {%- elif 'text' in item -%}\n" + " {%- set ns.parts = ns.parts + [item.text] -%}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected item type in content.') -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.parts | join('\\n') -}}\n" + " {%- elif content is none or content is 
undefined -%}\n" + " {{- '' -}}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected content type.') -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- if not messages %}\n" + " {{- raise_exception('No messages provided.') }}\n" + "{%- endif %}\n" + "{%- if tools and tools is iterable and tools is not mapping %}\n" + " {{- '<|im_start|>system\\n' }}\n" + " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" + " {%- for tool in tools %}\n" + " {{- '\\n' }}\n" + " {{- tool | tojson }}\n" + " {%- endfor %}\n" + " {{- '\\n' }}\n" + " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {%- if content %}\n" + " {{- '\\n\\n' + content }}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + "{%- else %}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" + "{%- for message in messages[::-1] %}\n" + " {%- set index = (messages|length - 1) - loop.index0 %}\n" + " {%- if ns.multi_step_tool and message.role == 'user' %}\n" + " {%- set content = 
render_content(message.content)|trim %}\n" + " {%- if not(content.startswith('') and content.endswith('')) %}\n" + " {%- set ns.multi_step_tool = false %}\n" + " {%- set ns.last_query_index = index %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if ns.multi_step_tool %}\n" + " {{- raise_exception('No user query found in messages.') }}\n" + "{%- endif %}\n" + "{%- for message in messages %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if message.role == 'system' %}\n" + " {%- if not loop.first %}\n" + " {{- raise_exception('System message must be at the beginning.') }}\n" + " {%- endif %}\n" + " {%- elif message.role == 'user' %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" + " {%- elif message.role == 'assistant' %}\n" + " {%- set reasoning_content = '' %}\n" + " {%- if message.reasoning_content is string %}\n" + " {%- set reasoning_content = message.reasoning_content %}\n" + " {%- else %}\n" + " {%- if '' in content %}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {%- set reasoning_content = reasoning_content|trim %}\n" + " {%- if loop.index0 > ns.last_query_index %}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" + " {%- else %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" + " {%- endif %}\n" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" + " {%- for tool_call in message.tool_calls %}\n" + " {%- if tool_call.function is defined %}\n" + " {%- set tool_call = tool_call.function %}\n" + " {%- endif %}\n" + " {%- if loop.first %}\n" + " {%- if content|trim %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " 
{{- '\\n\\n\\n' }}\n" + " {%- endif %}\n" + " {%- if tool_call.arguments is defined %}\n" + " {%- for args_name, args_value in tool_call.arguments|items %}\n" + " {{- '\\n' }}\n" + " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" + " {{- args_value }}\n" + " {{- '\\n\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif message.role == 'tool' %}\n" + " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" + " {{- '<|im_start|>user' }}\n" + " {%- endif %}\n" + " {{- '\\n\\n' }}\n" + " {{- content }}\n" + " {{- '\\n' }}\n" + " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif loop.last %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- raise_exception('Unexpected message role.') }}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if add_generation_prompt %}\n" + " {{- '<|im_start|>assistant\\n' }}\n" + " {%- if enable_thinking is defined and enable_thinking is false %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V-4.6 Handler. + + Args: + enable_thinking (bool): Controls whether to open a `` block for reasoning. + Defaults to False as per the standard template logic. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # MiniCPM uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class Gemma3ChatHandler(MTMDChatHandler): + + GEMMA3_BOI_TOKEN = "" + GEMMA3_EOI_TOKEN = "" + GEMMA3_BOS_TOKEN = "" + GEMMA3_EOS_TOKEN = "" + + CHAT_FORMAT = ( + "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" + "{% if messages[0]['content'] is string %}" + "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" + "{% else %}" + "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" + "{% endif %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set first_user_prefix = '' %}" + "{% endif %}" + + "{% for message in loop_messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{% endif %}" + + "{% if message['role'] == 'assistant' %}" + "{% set role = 'model' %}" + "{% else %}" + "{% set role = message['role'] %}" + "{% endif %}" + + "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" + + "{% if message['content'] is string %}" + "{{ message['content'] | trim }}" + "{% elif message['content'] is iterable %}" + "{% for item in message['content'] %}" + "{% if item['type'] == 'image_url' and item['image_url'] is string %}" + "{{ '' + item['image_url'] + '' }}" + "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" + "{{ '' + item['image_url']['url'] + '' }}" + "{% elif item['type'] == 'text' %}" + "{{ item['text'] | 
trim }}" + "{% endif %}" + "{% endfor %}" + "{% else %}" + "{{ raise_exception('Invalid content type') }}" + "{% endif %}" + + "\n" + "{% endfor %}" + + "{% if add_generation_prompt %}" + "model\n" + "{% endif %}" + ) + + +class Gemma4ChatHandler(MTMDChatHandler): + """ + Handler for Gemma 4 models. + + Note on `enable_thinking`: + The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. + It is NOT supported by Gemma4 E2B and E4B models. + + [Important Note for Audio Processing!] + It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. + Other quantizations are known to have degraded performance; + ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 + """ + + # The special token in Gemma 4 + GEMMA4_BOI_TOKEN = "<|image>" + GEMMA4_EOI_TOKEN = "" + GEMMA4_BOA_TOKEN = "<|audio>" + GEMMA4_EOA_TOKEN = "" + GEMMA4_BOS_TOKEN = "" + GEMMA4_EOS_TOKEN = "" + GEMMA4_SOT_TOKEN = "<|turn>" + GEMMA4_EOT_TOKEN = "" + GEMMA4_SOC_TOKEN = "<|channel>" + GEMMA4_EOC_TOKEN = "" + GEMMA4_STC_TOKEN = "<|tool_call>" + GEMMA4_ETC_TOKEN = "" + GEMMA4_STD_TOKEN = "<|tool>" + GEMMA4_ETD_TOKEN = "" + GEMMA4_STR_TOKEN = "<|tool_response>" + GEMMA4_ETR_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" + " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in properties | dictsort -%}\n" + " {%- set add_comma = false -%}\n" + " {%- if not filter_keys or key not in standard_keys -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {{ key }}:{\n" + " {%- if value['description'] -%}\n" + " description:<|\"|>{{ value['description'] }}<|\"|>\n" + " {%- set add_comma = true -%}\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'STRING' -%}\n" + " {%- if value['enum'] -%}\n" + " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " enum:{{ format_argument(value['enum']) }}\n" + " {%- endif -%}\n" + " {%- elif value['type'] | upper == 'ARRAY' -%}\n" + " {%- if value['items'] is mapping and value['items'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " items:{\n" + " {%- set ns_items = namespace(found_first=false) -%}\n" + " {%- for item_key, item_value in value['items'] | dictsort -%}\n" + " {%- if item_value is not none -%}\n" + " {%- if ns_items.found_first %},{% endif -%}\n" + " {%- set ns_items.found_first = true -%}\n" + " {%- if item_key == 'properties' -%}\n" + " properties:{\n" + " {%- if item_value is mapping -%}\n" + " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" + " {%- endif -%}\n" + " }\n" + " {%- elif item_key == 'required' -%}\n" + " required:[\n" + " {%- for req_item in item_value -%}\n" + " <|\"|>{{- req_item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- elif item_key == 'type' -%}\n" + " {%- if item_value is string -%}\n" + " type:{{ format_argument(item_value | upper) }}\n" + " {%- else -%}\n" + " type:{{ format_argument(item_value | map('upper') | list) }}\n" + " {%- endif -%}\n" + " {%- else -%}\n" + " {{ item_key }}:{{ format_argument(item_value) }}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " }\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if value['nullable'] %}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " nullable:true\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'OBJECT' -%}\n" + " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" + " }\n" + " {%- elif value is mapping 
-%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" + " }\n" + " {%- endif -%}\n" + " {%- if value['required'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " required:[\n" + " {%- for item in value['required'] | default([]) -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + "{%- endmacro -%}\n" + "{%- macro format_function_declaration(tool_data) -%}\n" + " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" + " {%- set params = tool_data['function']['parameters'] -%}\n" + " {%- if params -%}\n" + " ,parameters:{\n" + " {%- if params.get('properties') -%}\n" + " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" + " {%- endif -%}\n" + " {%- if params.get('required') -%}\n" + " required:[\n" + " {%- for item in params['required'] -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {{- ',' if not loop.last -}}\n" + " {%- endfor -%}\n" + " ],\n" + " {%- endif -%}\n" + " {%- if params.get('type') -%}\n" + " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if 'response' in tool_data['function'] -%}\n" + " {%- set response_declaration = tool_data['function']['response'] -%}\n" + " ,response:{\n" + " {%- if response_declaration['description'] -%}\n" + " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" + " {%- endif -%}\n" + " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" + " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " }\n" + "{%- endmacro -%}\n" + "{%- macro format_argument(argument, escape_keys=True) -%}\n" + " {%- if argument is string -%}\n" + " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" + " {%- elif argument is boolean -%}\n" + " {{- 'true' if argument else 'false' -}}\n" + " {%- elif argument is mapping -%}\n" + " {{- '{' -}}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in argument | dictsort -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {%- if escape_keys -%}\n" + " {{- '<|\"|>' + key + '<|\"|>' -}}\n" + " {%- else -%}\n" + " {{- key -}}\n" + " {%- endif -%}\n" + " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- elif argument is sequence -%}\n" + " {{- '[' -}}\n" + " {%- for item in argument -%}\n" + " {{- format_argument(item, escape_keys=escape_keys) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- ']' -}}\n" + " {%- else -%}\n" + " {{- argument -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- macro strip_thinking(text) -%}\n" + " {%- set ns = namespace(result='') -%}\n" + " {%- for part in text.split('') -%}\n" + " {%- if '<|channel>' in part -%}\n" + " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" + " {%- else -%}\n" + " {%- set ns.result = ns.result + part -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.result | trim -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- macro format_tool_response_block(tool_name, response) -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- if response is mapping -%}\n" + " {{- 'response:' + tool_name + '{' -}}\n" + " {%- for key, value in response | dictsort -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- else -%}\n" 
+ " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" + " {%- endif -%}\n" + " {{- '' -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- set ns = namespace(prev_message_type=None) -%}\n" + "{%- set loop_messages = messages -%}\n" + "{{- bos_token -}}\n" + "{#- Handle System/Tool Definitions Block -#}\n" + "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" + " {{- '<|turn>system\\n' -}}\n" + " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" + " {%- if enable_thinking is defined and enable_thinking -%}\n" + " {{- '<|think|>\\n' -}}\n" + " {%- set ns.prev_message_type = 'think' -%}\n" + " {%- endif -%}\n" + " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" + " {%- if messages[0]['content'] is string -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- elif messages[0]['content'] is sequence -%}\n" + " {%- for item in messages[0]['content'] -%}\n" + " {{- item['text'] | trim + ' '-}}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set loop_messages = messages[1:] -%}\n" + " {%- endif -%}\n" + " {%- if tools -%}\n" + " {%- for tool in tools %}\n" + " {{- '<|tool>' -}}\n" + " {{- format_function_declaration(tool) | trim -}}\n" + " {{- '' -}}\n" + " {%- endfor %}\n" + " {%- set ns.prev_message_type = 'tool' -%}\n" + " {%- endif -%}\n" + " {{- '\\n' -}}\n" + "{%- endif %}\n" + "\n" + "{#- Pre-scan: find last user message index for reasoning guard -#}\n" + "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" + "{%- for i in range(loop_messages | length) -%}\n" + " {%- if loop_messages[i]['role'] == 'user' -%}\n" + " {%- set ns_turn.last_user_idx = i -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{#- Loop through messages -#}\n" + "{%- for message in loop_messages -%}\n" + " {%- if message['role'] != 'tool' -%}\n" + " {%- set ns.prev_message_type = None -%}\n" + " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" + " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" + " {%- set prev_nt = namespace(role=None, found=false) -%}\n" + " {%- if loop.index0 > 0 -%}\n" + " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" + " {%- if not prev_nt.found -%}\n" + " {%- if loop_messages[j]['role'] != 'tool' -%}\n" + " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" + " {%- set prev_nt.found = true -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" + " {%- if not continue_same_model_turn -%}\n" + " {{- '<|turn>' + role + '\\n' }}\n" + " {%- endif -%}\n" + "\n" + " {#- Render reasoning/reasoning_content as thinking channel -#}\n" + " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" + " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" + " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" + " {%- endif -%}\n" + "\n" + " {%- if message.get('tool_calls') -%}\n" + " {%- for tool_call in message['tool_calls'] -%}\n" + " {%- set function = tool_call['function'] -%}\n" + " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" + " {%- if function['arguments'] is mapping -%}\n" + " {%- set ns_args = namespace(found_first=false) -%}\n" + " {%- for key, value in function['arguments'] | dictsort -%}\n" + " {%- if ns_args.found_first %},{% endif -%}\n" + " {%- set ns_args.found_first = true -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- endfor -%}\n" + " {%- elif function['arguments'] is string -%}\n" + " {{- function['arguments'] -}}\n" + " {%- endif -%}\n" + " {{- '}' -}}\n" + " {%- endfor -%}\n" + " {%- set ns.prev_message_type = 'tool_call' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" + " {%- if message.get('tool_responses') -%}\n" + " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" + " {%- for tool_response in message['tool_responses'] -%}\n" + " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endfor -%}\n" + " {%- elif message.get('tool_calls') -%}\n" + " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" + " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" + " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" + " {%- if ns_tool_scan.stopped -%}\n" + " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" + " {%- set ns_tool_scan.stopped = true -%}\n" + " {%- else -%}\n" + " {%- set follow = loop_messages[k] -%}\n" + " {#- Resolve tool_call_id to function name -#}\n" + " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" + " {%- for tc in message['tool_calls'] -%}\n" + " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" + " {%- set ns_tname.name = tc['function']['name'] -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {#- Handle content as string or content-parts array -#}\n" + " {%- set tool_body = follow.get('content') -%}\n" + " {%- if tool_body is string -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- elif tool_body is sequence and tool_body is not string -%}\n" + " {%- set ns_txt = namespace(s='') -%}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'text' -%}\n" + " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'image_url' -%}\n" + " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" + " {%- if part.get('type') == 'audio_url' -%}\n" + " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif part.get('type') == 'input_audio' -%}\n" + " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + # " {%- elif part.get('type') == 'video_url' -%}\n" + # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- else -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- endif -%}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set captured_content -%}\n" + " {%- if message['content'] is string -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(message['content']) -}}\n" + " {%- else -%}\n" + " {{- message['content'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif message['content'] is sequence -%}\n" + " {%- for item in message['content'] -%}\n" + " {%- if item['type'] == 'text' -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(item['text']) -}}\n" + " {%- else -%}\n" + " {{- item['text'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif item['type'] == 'image_url' -%}\n" + " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- set ns.prev_message_type = 'image' -%}\n" + 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" + " {%- if item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- endif -%}\n" + # " {%- elif item['type'] == 'video_url' -%}\n" + # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + # " {%- set ns.prev_message_type = 'video' -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endset -%}\n" + "\n" + " {{- captured_content -}}\n" + " {%- set has_content = captured_content | trim | length > 0 -%}\n" + "\n" + " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- if add_generation_prompt -%}\n" + " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" + " {{- '<|turn>model\\n' -}}\n" + " {%- if not enable_thinking | default(false) -%}\n" + " {{- '<|channel>thought\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Gemma 4 Handler. + + Args: + enable_thinking (bool): Controls whether the <|think|> tag is injected and + manages <|channel>thought behavior. + Note: ONLY supported on Gemma4 31B and 26BA4B models. + NOT supported on Gemma4 E2B and E4B models. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set the stop token based on Gemma 4's format () + # generation_config.json: "eos_token_id": [1, 106, 50] + kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class GLM41VChatHandler(MTMDChatHandler): + # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. + + GLM41V_EOS_TOKEN = "<|endoftext|>" + GLM41V_PAD_TOKEN = "<|endoftext|>" + GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]\n" + "{%- for msg in messages -%}" + "{%- if msg.role == 'system' -%}" + "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'user' -%}" + "<|user|>\n" + "{%- if msg.content is string -%}" + "{{ msg.content }}" + "{%- else -%}" + "{%- for item in msg.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'assistant' -%}" + "{%- if msg.metadata -%}" + "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- else -%}" + "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{%- endif -%}" + ) + + def __call__(self, **kwargs): + 
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class GLM46VChatHandler(MTMDChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
+ "{{ '/nothink' if not enable_thinking else '' }}" + "{%- elif m.role == 'assistant' -%}" + "<|assistant|>" + "{%- if enable_thinking -%}" + "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" + "\n{{ reasoning.strip() }}" + "{%- else -%}" + "\n" + "{%- endif -%}" + "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" + "{%- endif -%}" + "{{ GLM46V_EOS_TOKEN }}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{{ '' if enable_thinking else '\n' }}" + "{%- endif -%}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + GLM-4.6V Handler + Parameters: + - enable_thinking (bool): Whether to enable the model's think process. The default is True. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN + + # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json + kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class GraniteDoclingChatHandler(MTMDChatHandler): + """ + Handler for Granite-Docling models. + + Format(512x512): Content + + Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! + Since the model does not have special tokens for the start and end of an image, + it is recommended to process only one image at a time. + You can iterate through the images individually for recognition. 
+ + """ + GRANITE_BOS_TOKEN = "<|start_of_role|>" + GRANITE_EOS_TOKEN = "<|end_of_text|>" + GRANITE_PAD_TOKEN = "<|end_of_text|>" + GRANITE_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for part in message['content'] -%}" + "{%- if part['type'] == 'text' -%}" + "{{- part['text'] -}}" + "{%- elif part['type'] == 'image_url' -%}" + "{%- if part.image_url is string -%}" + "{{- part.image_url -}}" + "{%- else -%}" + "{{- part.image_url.url -}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '<|end_of_text|>\n' -}}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|start_of_role|>assistant' -}}" + # Support the 'controls' parameter if present in generation arguments + "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" + "{{- '<|end_of_role|>' -}}" + "{%- endif -%}" + ) + + def __init__(self, controls: dict = None, **kwargs): + """ + Granite-Docling Handler + Args: + controls (dict, optional): Operational parameters passed to the assistant role. + + The 'controls' parameter is used to guide the model's behavior or output format. 
+ Common examples for 'controls' include: + - Document Parsing: {"mode": "document_parsing", "format": "json"} + """ + self.controls = controls + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject controls into the template environment + self.extra_template_arguments["controls"] = self.controls + self.DEFAULT_SYSTEM_MESSAGE = None + kwargs['stop'] = [self.GRANITE_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + + return super().__call__(**kwargs) + + +class LFM2VLChatHandler(MTMDChatHandler): + LFM2VL_BOS_TOKEN = "<|startoftext|>" + LFM2VL_EOS_TOKEN = "<|im_end|>" + LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{ '<|im_start|>' + message['role'] + '\n' }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] }}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if 'image_url' in content -%}" + "{%- if content.image_url is string -%}" + "<|image_start|>{{ content.image_url }}<|image_end|>" + "{%- else -%}" + "<|image_start|>{{ content.image_url.url }}<|image_end|>" + "{%- endif -%}" + "{%- elif content['type'] == 'text' -%}" + "{{ content['text'] }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{ '<|im_end|>\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ '<|im_start|>assistant\n' }}" + "{%- endif -%}" + ) + + def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): + """ + LFM2-VL Handler + LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 + """ + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) + + 
def __call__(self, **kwargs): + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + return super().__call__(**kwargs) + + +class LFM25VLChatHandler(MTMDChatHandler): + """ + Handler for LFM2.5-VL multimodal models. + + Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. + """ + # Aligned with LFM2.5-VL tokenizer_config + LFM25VL_BOS_TOKEN = "<|startoftext|>" + LFM25VL_EOS_TOKEN = "<|im_end|>" + LFM25VL_PAD_TOKEN = "<|pad|>" + + # Image specific tokens + LFM25VL_IMAGE_TOKEN = "" + LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" + LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" + + CHAT_FORMAT = ( + "{{- bos_token -}}\n" + "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" + "{%- set ns = namespace(system_prompt='', content='') -%}\n" + "{%- if messages[0]['role'] == 'system' -%}\n" + " {%- set ns.system_prompt = messages[0]['content'] -%}\n" + " {%- set messages = messages[1:] -%}\n" + "{%- endif -%}\n" + "{%- if tools -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" + " {%- for tool in tools -%}\n" + " {%- if tool is not string -%}\n" + " {%- set tool = tool | tojson -%}\n" + " {%- endif -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" + " {%- if not loop.last -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" + "{%- endif -%}\n" + "{%- if ns.system_prompt -%}\n" + " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" + "{%- endif -%}\n" + "{%- set ns.last_assistant_index = -1 -%}\n" + "{%- for message in messages -%}\n" + " {%- if message['role'] == 'assistant' -%}\n" + " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- for message in messages -%}\n" + " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" + " {%- set content = message['content'] -%}\n" + " {%- if content is not string -%}\n" + " {%- set ns.content = '' -%}\n" + " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" + " {%- for item in content -%}\n" + " {%- if item['type'] == 'image_url' -%}\n" + " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {%- set ns.content = ns.content + img_val -%}\n" + " {%- elif item['type'] == 'text' -%}\n" + " {%- set ns.content = ns.content + item['text'] -%}\n" + " {%- else -%}\n" + " {%- set ns.content = ns.content + (item | tojson) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set content = ns.content -%}\n" + " {%- endif -%}\n" + " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" + " {%- if '' in content -%}\n" + " {%- set content = content.split('')[-1] | trim -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {{- content + '<|im_end|>\\n' -}}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, keep_past_thinking: bool = False, **kwargs): + self.keep_past_thinking = keep_past_thinking + super().__init__(**kwargs) + + + def __call__(self, **kwargs): + if self.image_min_tokens > 256: + if self.verbose: + print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") + self.image_min_tokens = -1 + + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking + + kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") + return super().__call__(**kwargs) + + +class PaddleOCRChatHandler(MTMDChatHandler): + """ + Handler for PaddleOCR 1.5/1.6 multimodal models. + """ + + PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" + PADDLEOCR_BOS_TOKEN = "" + PADDLEOCR_EOS_TOKEN = "" + PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" + PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" + PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" + + CHAT_FORMAT = ( + "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" + "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" + "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" + + "{{- cls_token -}}" + "{%- for message in messages -%}" + "{%- if message['role'] == 'user' -%}" + "{{- 'User: ' -}}" + + # Robust parsing: Check if content is string or list + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + # Pass 1: Render all images first + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" + "{{- '<|IMAGE_START|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|IMAGE_END|>' -}}" + "{%- endif -%}" + "{%- endfor -%}" + + # Pass 2: Render all text second + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '\\n' -}}" + + "{%- elif message['role'] == 'assistant' -%}" + "{{- 
'Assistant:\\n' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- eos_token -}}" + + "{%- elif message['role'] == 'system' -%}" + "{%- if message['content'] is string -%}" + "{{- message['content'] + '\\n' -}}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] + '\\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "{{- 'Assistant:\\n' -}}" + "{%- endif -%}" + ) + + def __init__( + self, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + **kwargs + ): + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__( + image_min_tokens=self.image_min_tokens, + image_max_tokens=self.image_max_tokens, + **kwargs + ) + + def __call__(self, **kwargs): + # Set the specific stop token defined in the PaddleOCR template + kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + return super().__call__(**kwargs) + + +class Qwen25VLChatHandler(MTMDChatHandler): + + QWEN25_VL_BOS_TOKEN = "<|endoftext|>" + QWEN25_VL_PAD_TOKEN = "<|endoftext|>" + QWEN25_VL_EOS_TOKEN = "<|im_end|>" + + CHAT_FORMAT = ( + "{% set image_count = namespace(value=0) %}" + "{% for message in messages %}" + "{% if loop.first and message['role'] != 'system' %}" + "<|im_start|>system\n" + "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" + "{% endif %}" + "<|im_start|>{{ message['role'] }}\n" + "{% if message['content'] is string %}" + "{{ message['content'] }}<|im_end|>\n" + "{% else %}" + "{% for content in 
message['content'] %}" + "{% if content['type'] == 'image_url' %}" + "{% if content.image_url is string %}" + "{% set image_count.value = image_count.value + 1 %}" + "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" + "{% else %}" + "{% set image_count.value = image_count.value + 1 %}" + "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" + "{% endif %}" + "{% elif content['type'] == 'text' %}" + "{{ content['text'] }}" + "{% endif %}" + "{% endfor %}" + "<|im_end|>\n" + "{% endif %}" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + +class Qwen3ASRChatHandler(MTMDChatHandler): + """ + Handler for Qwen 3 ASR (Automatic Speech Recognition) models. + + Features: + - Highly specialized for Speech-to-Text tasks. + - Aggregates all system text into a single cohesive system block. + - Drops user text entirely, extracting ONLY audio data into a unified user turn. + - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. + - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. + """ + + DEFAULT_SYSTEM_MESSAGE = """ + You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. + You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
+ """ + + QWEN3_ASR_BOS_TOKEN = "<|im_start|>" + QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" + QWEN3_ASR_EOS_TOKEN = "<|im_end|>" + + + QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" + QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" + QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" + + CHAT_FORMAT = ( + "{%- set ns = namespace(system_text='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.role == 'system' -%}\n" + " {%- if m.content is string -%}\n" + " {%- set ns.system_text = ns.system_text + m.content -%}\n" + " {%- else -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'text' and (c.text is defined) -%}\n" + " {%- set ns.system_text = ns.system_text + c.text -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- set ns2 = namespace(audio_tokens='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.content is not string -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" + " {#- MTMD Audio Injection -#}\n" + " {%- set audio_val = '' -%}\n" + " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" + " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" + " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" + " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" + " {%- endif -%}\n" + " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" + "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif 
-%}\n" + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token + kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") + + return super().__call__(**kwargs) + +class Qwen3VLChatHandler(MTMDChatHandler): + + QWEN3_VL_BOS_TOKEN = "<|endoftext|>" + QWEN3_VL_PAD_TOKEN = "<|endoftext|>" + QWEN3_VL_EOS_TOKEN = "<|im_end|>" + + CHAT_FORMAT = ( + "{{- '<|im_start|>system\n' -}}" + "{%- if messages[0].content is string and messages[0].role == 'system' -%}" + "{{- messages[0].content -}}" + "{%- elif messages[0].role == 'system' -%}" + "{%- if 'text' in messages[0].content -%}" + "{{- messages[0].content.text -}}" + "{%- else -%}" + "{{- 'You are a helpful assistant.' 
-}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- if tools -%}" + "{{- '\n\n' -}}" + "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" + "{%- for tool in tools -%}" + "{{- '\n' -}}" + "{{- tool | tojson -}}" + "{%- endfor -%}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" + "{%- endif -%}" + "{{- '<|im_end|>\n' -}}" + "{%- set image_count = namespace(value=0) -%}" + #"{%- set video_count = namespace(value=0) -%}" + "{%- for message in messages -%}" + "{%- if message.role == 'tool' -%}" + "{{- '<|im_start|>user\n\n' -}}" + "{%- elif message.role != 'system' -%}" + "{{- '<|im_start|>' + message.role + '\n' -}}" + "{%- endif -%}" + "{%- if message.content is string and message.role != 'system' -%}" + "{{- message.content -}}" + "{%- elif message.role != 'system' -%}" + "{%- for content in message.content -%}" + "{%- if 'image_url' in content -%}" + "{%- set image_count.value = image_count.value + 1 -%}" + "{%- if add_vision_id -%}" + "{{- 'Picture ' -}}" + "{{- image_count.value | string -}}" + "{{- ': ' -}}" + "{%- endif -%}" + "{{- '<|vision_start|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|vision_end|>' -}}" + "{%- endif -%}" + # Video not supported yet + "{%- if 'text' in content -%}" + "{{- content.text -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- if message.role == 'assistant' -%}" + "{%- if message.tool_calls -%}" + "{%- for tool_call in message.tool_calls -%}" + "{%- if (loop.first and message.content) or (not loop.first) -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- if tool_call.function 
-%}" + "{%- set tool_call = tool_call.function -%}" + "{%- endif -%}" + "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" + "{%- if tool_call.arguments is string -%}" + "{{- tool_call.arguments -}}" + "{%- else -%}" + "{{- tool_call.arguments | tojson -}}" + "{%- endif -%}" + "{{- '}\n' -}}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- elif message.role == 'tool' -%}" + "{{- '' -}}" + "{%- endif -%}" + "{%- if message.role != 'system' -%}" + "{{- '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|im_start|>assistant\n' -}}" + "{%- if force_reasoning -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + force_reasoning: bool = False, + add_vision_id: bool = True, + **kwargs, + ): + """ + Parameters: + - force_reasoning (bool): + - True: Force the reasoning in the model by adding to the chat template. + - False (default): Don't force the reasoning. + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + """ + super().__init__(**kwargs) + self.force_reasoning = force_reasoning + self.extra_template_arguments["force_reasoning"] = force_reasoning + self.extra_template_arguments["add_vision_id"] = add_vision_id + + def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + +class Qwen35ChatHandler(MTMDChatHandler): + """ + Handler for Qwen3.5/Qwen3.6 models. 
+ """ + CHAT_FORMAT = ( + "{%- set image_count = namespace(value=0) -%}" + "{%- set video_count = namespace(value=0) -%}" + "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" + " {%- if content is string -%}" + " {{- content -}}" + " {%- elif content is iterable and content is not mapping -%}" + " {%- for item in content -%}" + " {%- if 'image_url' in item or item.type == 'image_url' -%}" + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain images.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set image_count.value = image_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Picture ' -}}" + " {{- image_count.value | string -}}" + " {{- ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {%- if item.image_url is string -%}" + " {{- item.image_url -}}" + " {%- else -%}" + " {{- item.image_url.url -}}" + " {%- endif -%}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'video' in item -%}" + " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain videos.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set video_count.value = video_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Video ' ~ video_count.value ~ ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {{- item.video -}}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'text' in item -%}" + " {{- item.text -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected item type in content.') -}}" + " {%- endif -%}" + " {%- endfor -%}" + " {%- elif content is none or content is undefined -%}" + " {{- '' -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected content type.') -}}" + " {%- endif -%}" + "{%- endmacro -%}" + "{%- if not messages -%}" + " {{- raise_exception('No messages provided.') 
-}}" + "{%- endif -%}" + "{%- if tools and tools is iterable and tools is not mapping -%}" + " {{- '<|im_start|>system\n' -}}" + " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" + " {%- for tool in tools -%}" + " {{- '\n' -}}" + " {{- tool | tojson -}}" + " {%- endfor -%}" + " {{- '\n' -}}" + " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" + " {%- if messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) | trim -%}" + " {%- if content -%}" + " {{- '\n\n' + content -}}" + " {%- endif -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + "{%- elif messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) -%}" + " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" + "{%- for message in messages[::-1] -%}" + " {%- set index = messages | length - 1 - loop.index0 -%}" + " {%- if ns.multi_step_tool and message.role == 'user' -%}" + " {%- set content = render_content(message.content, false) | trim -%}" + " {%- if not (content.startswith('') and content.endswith('')) -%}" + " {%- set ns.multi_step_tool = false -%}" + " {%- set ns.last_query_index = index -%}" + " {%- endif -%}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if ns.multi_step_tool -%}" + " {{- 
raise_exception('No user query found in messages.') -}}" + "{%- endif -%}" + "{%- for message in messages -%}" + " {%- set content = render_content(message.content, true) | trim -%}" + " {%- if message.role == 'system' -%}" + " {%- if not loop.first -%}" + " {{- raise_exception('System message must be at the beginning.') -}}" + " {%- endif -%}" + " {%- elif message.role == 'user' -%}" + " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" + " {%- elif message.role == 'assistant' -%}" + " {%- set reasoning_content = '' -%}" + " {%- if message.reasoning_content is string -%}" + " {%- set reasoning_content = message.reasoning_content -%}" + " {%- elif '' in content -%}" + " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" + " {%- set content = content.split('')[-1].lstrip('\n') -%}" + " {%- endif -%}" + " {%- set reasoning_content = reasoning_content | trim -%}" + " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" + " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" + " {%- else -%}" + " {{- '<|im_start|>' + message.role + '\n' + content -}}" + " {%- endif -%}" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" + " {%- for tool_call in message.tool_calls -%}" + " {%- if tool_call.function is defined -%}" + " {%- set tool_call = tool_call.function -%}" + " {%- endif -%}" + " {%- if loop.first -%}" + " {%- if content | trim -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- '\n\n\n' -}}" + " {%- endif -%}" + " {%- if tool_call.arguments is defined -%}" + " {%- for (args_name, args_value) in tool_call.arguments | items -%}" + " {{- '\n' -}}" + " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" + " {{- args_value -}}" + " {{- '\n' -}}" + 
" {%- endfor -%}" + " {%- endif -%}" + " {{- '\n' -}}" + " {%- endfor -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif message.role == 'tool' -%}" + " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" + " {{- '<|im_start|>user' -}}" + " {%- endif -%}" + " {{- '\n\n' -}}" + " {{- content -}}" + " {{- '\n' -}}" + " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif loop.last -%}" + " {{- '<|im_end|>\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- raise_exception('Unexpected message role.') -}}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + " {{- '<|im_start|>assistant\n' -}}" + " {%- if enable_thinking is defined and enable_thinking is false -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n' -}}" + " {%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + add_vision_id: bool = True, + enable_thinking: bool = True, + preserve_thinking: bool = False, + **kwargs, + ): + """ + Parameters: + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + - enable_thinking (bool): + - True (default): Enables reasoning for better results. + - False: Disables reasoning for faster results. + - preserve_thinking (bool): + - True: Keeps reasoning process for ALL historical conversational turns. + - False (default): Only keeps for the latest assistant reply to save tokens. 
+ """ + super().__init__(**kwargs) + self.enable_thinking = enable_thinking + self.preserve_thinking = preserve_thinking + self.extra_template_arguments["add_vision_id"] = add_vision_id + self.extra_template_arguments["enable_thinking"] = enable_thinking + self.extra_template_arguments["preserve_thinking"] = preserve_thinking + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class Step3VLChatHandler(MTMDChatHandler): + """ + Handler for Step3-VL models. + """ + + STEP3VL_BOS_TOKEN = "<|im_start|>" + STEP3VL_EOS_TOKEN = "<|im_end|>" + STEP3VL_PAD_TOKEN = "<|endoftext|>" + STEP3VL_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro render_content(content) -%}\n" + " {%- if content is none -%}{{- '' -}}\n" + " {%- elif content is string -%}{{- content -}}\n" + " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" + " {%- elif content is iterable -%}\n" + " {%- for item in content -%}\n" + " {%- if item.type == 'text' -%}\n" + " {{- item['value'] if 'value' in item else item['text'] -}}\n" + " {%- elif item.type in ['image', 'image_url'] -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.image_url -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {{- '' + url_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "\n" + "{%- if tools -%}\n" + " {{- '<|im_start|>system\\n' -}}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" + " {%- endif -%}\n" + " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" + " {%- for tool in tools -%}\n" + " {{- '\\n' -}}\n" + " {{- tool | tojson -}}\n" + " {%- endfor -%}\n" + " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" + "{%- else -%}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + "\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" + "{%- for message in messages[::-1] -%}\n" + " {%- set index = (messages|length - 1) - loop.index0 -%}\n" + " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" + " {%- set ns.multi_step_tool = false -%}\n" + " {%- set ns.last_query_index = index -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- for message in messages -%}\n" + " {%- set content = render_content(message.content) -%}\n" + " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" + " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" + " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" + " {%- elif message.role == 'assistant' -%}\n" + " {%- if message.reasoning_content is string -%}\n" + " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" + " {%- else -%}\n" + " {%- if '' in content -%}\n" + 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" + " {%- else -%}\n" + " {%- set reasoning_content = '' -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if loop.index0 > ns.last_query_index -%}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" + " {%- else -%}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" + " {%- endif -%}\n" + " {%- if message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- for tool_call in message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- if tool_call.function -%}\n" + " {%- set tool_call = tool_call.function -%}\n" + " {%- endif -%}\n" + " {{- '\\n{\"name\": \"' -}}\n" + " {{- tool_call.name -}}\n" + " {{- '\", \"arguments\": ' -}}\n" + " {%- if tool_call.arguments is string -%}\n" + " {{- tool_call.arguments -}}\n" + " {%- else -%}\n" + " {{- tool_call.arguments | tojson -}}\n" + " {%- endif -%}\n" + " {{- '}\\n' -}}\n" + " {%- endfor -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- elif message.role == 'tool' -%}\n" + " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" + " {{- '<|im_start|>tool_response' -}}\n" + " {%- endif -%}\n" + " {{- '\\n\\n' -}}\n" + " {{- content -}}\n" + " {{- '\\n' -}}\n" + " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Step3-VL Handler. + + Args: + enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Pass thinking toggle into Jinja + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Step3 uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) From d84b0c21fa4a131df5c17ddd1b2447929dc1973f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 22:08:32 +0800 Subject: [PATCH 23/36] fix(model): handle missing chat templates - Update LlamaModel.model_chat_template() to return Optional[str] and accept name=None for the default model chat template. - llama_model_chat_template() may return nullptr when no chat template is available. Handle that case explicitly instead of decoding a null pointer, and return None so callers can apply their own fallback logic. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 434921e6bd..91befb2247 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -152,12 +152,17 @@ def model_size(self) -> int: """ return llama_cpp.llama_model_size(self.model) - def model_chat_template(self, name: bytes) -> str: + def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]: """ - Get the default chat template. Returns nullptr if not available - If name is NULL, returns the default chat template + Get a chat template from the model. + + If name is None, returns the default chat template. + Returns None if no chat template is available. 
""" - return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + template = llama_cpp.llama_model_chat_template(self.model, name) + if template is None: + return None + return template.decode("utf-8") def n_params(self) -> int: """ From c9745316d748cec408b07c2f3a43fd97fa921e73 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 23:43:33 +0800 Subject: [PATCH 24/36] feat(mtmd): enhance generic chat template support - Enhance GenericMTMDChatHandler to better support model-provided chat templates. - Allow the generic handler to accept an optional named chat template, load it from the model at call time via llama_model_chat_template(), fall back to the model's default chat template, and finally use the built-in MTMD CHAT_FORMAT when no model template is available. - Also expand the generic media placeholder list for common multimodal templates and document the handler as a template-driven MTMD implementation. This prepares the generic path for a later render-driven placeholder replacement pass. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 2 + llama_cpp/llama_multimodal.py | 118 +++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index dbc60eaf76..b6a2c8d5a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -174,6 +174,7 @@ def __init__( log_filters: Optional[Sequence[str]] = None, log_filters_case_sensitive: bool = True, # Extra Params + chat_template_name: Optional[str] = None, chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): @@ -721,6 +722,7 @@ def __init__( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, + chat_template_name=chat_template_name, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py index a055869543..a0f7e594e4 100644 --- a/llama_cpp/llama_multimodal.py +++ b/llama_cpp/llama_multimodal.py @@ -91,6 +91,8 @@ class MTMDChatHandler: "{% endif %}" ) + KNOWN_MEDIA_TAGS: List[str] = [] + def __init__( self, mmproj_path: Optional[str] = None, @@ -1189,41 +1191,137 @@ def from_pretrained( **kwargs, ) -# Experiments are not recommended for this purpose at this time. +# Generic template-driven MTMD handler. class GenericMTMDChatHandler(MTMDChatHandler): + """ + Generic MTMD chat handler backed by the model-provided chat template. + + This handler is intentionally template-driven. It renders the model's + tokenizer.chat_template first, then normalizes rendered media URLs or + placeholder tokens into MTMD media markers before tokenization. + + It is designed for model templates that emit media placeholders such as + <|image_pad|>, <|image|>, , [IMG], or Kimi-style <|media_pad|>. + Model-specific handlers may still be preferable when a model requires + special stop tokens, generation flags, or non-standard template arguments. + """ + KNOWN_MEDIA_TAGS = [ + # Pad placeholders inside model-specific wrappers. 
"<|image_pad|>", "<|audio_pad|>", "<|video_pad|>", + + # Direct placeholders inside Gemma/Llama/GLM-style wrappers. "<|image|>", "<|audio|>", "<|video|>", - "[IMG]" + + # LLaVA / LFM / Mistral-style placeholders. + "", + "