From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Mon, 4 May 2026 20:58:58 +0200
Subject: [PATCH 01/36] Implemented generic multimodal chat handler.

---
 llama_cpp/llama.py             | 12 +++++++++
 llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 1241f81e26..848706a90d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -85,6 +85,7 @@ class Llama:
     def __init__(
         self,
         model_path: str,
+        clip_model_path: Optional[str] = None,
         *,
         # Model Params
         n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto",
@@ -608,6 +609,17 @@ def __init__(
 
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
+        
+        if clip_model_path is not None:
+            if self.chat_handler is not None and self.verbose:
+                print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True)
+
+            self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
+                gguf_metadata = self.metadata,
+                clip_model_path = clip_model_path,
+                model_arch = None,
+                verbose = self.verbose
+            )
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a0d8d25db4..468a73c077 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2887,10 +2887,14 @@ def __init__(
             raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")
 
         # Pre-compile Jinja template
+        if not hasattr(self, "chat_format") or self.chat_format is None:
+            self.chat_format = self.CHAT_FORMAT
+
+        self._chat_format_parser_tags = []
         self.chat_template = ImmutableSandboxedEnvironment(
             trim_blocks=True,
             lstrip_blocks=True,
-        ).from_string(self.CHAT_FORMAT)
+        ).from_string(self.chat_format)
 
         self._exit_stack = ExitStack()
 
@@ -3116,6 +3120,13 @@ def _process_mtmd_prompt(
             tool_choice=tool_choice,
             **getattr(self, 'extra_template_arguments', {})
         )
+        
+        for tag in self._chat_format_parser_tags:
+            if tag not in text:
+                continue
+
+            text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):]
+
         # Replace image_url by media_marker in text
         for item in media_items:
             text = text.replace(item["url"], media_marker)
@@ -3827,6 +3838,42 @@ def from_pretrained(
             **kwargs,
         )
 
+class GenericMTMDChatHandler(MTMDChatHandler):
+    def __init__(
+        self,
+        gguf_metadata: Dict[str, Any],
+        clip_model_path: str,
+        model_arch: Optional[str] = None,
+        verbose: bool = True,
+        **kwargs
+    ) -> None:
+        self.model_metadata = gguf_metadata
+
+        self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
+        self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch
+
+        if verbose:
+            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
+        
+        if self.arch is None:
+            if verbose:
+                print("Unknown model architecture. Will use general/most-common tags.")
+            
+            self.arch = "unknown"
+
+        if self.chat_format is None:
+            raise ValueError("Failed to get model chat template automatically.")
+        
+        super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
+        
+        if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]:
+            self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"]
+        elif self.arch in ["gemma4"]:
+            self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"]
+        elif self.arch in ["mistral3", "mistral4", "deepseek2"]:
+            self._chat_format_parser_tags += ["[IMG]"]
+        elif verbose:
+            print("Warning: Could not determine chat format parser tags.", flush = True)
 
 class Llava15ChatHandler(MTMDChatHandler):
     CHAT_FORMAT = (

From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Mon, 4 May 2026 21:19:20 +0200
Subject: [PATCH 02/36] Used text.replace()

---
 llama_cpp/llama_chat_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 468a73c077..ab5e438d3e 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3125,7 +3125,7 @@ def _process_mtmd_prompt(
             if tag not in text:
                 continue
 
-            text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):]
+            text = text.replace(tag, media_marker)
 
         # Replace image_url by media_marker in text
         for item in media_items:

From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Tue, 5 May 2026 17:46:08 +0200
Subject: [PATCH 03/36] Zero llama.input_ids before each generic handler call.

---
 llama_cpp/llama_chat_format.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index ab5e438d3e..40491968a9 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3874,6 +3874,18 @@ def __init__(
             self._chat_format_parser_tags += ["[IMG]"]
         elif verbose:
             print("Warning: Could not determine chat format parser tags.", flush = True)
+    
+    def __call__(self, **kwargs):
+        llama = kwargs['llama']
+
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
 
 class Llava15ChatHandler(MTMDChatHandler):
     CHAT_FORMAT = (

From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Tue, 5 May 2026 18:49:21 +0200
Subject: [PATCH 04/36] Implemented 'chat_handler_kwargs'.

---
 llama_cpp/llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 848706a90d..6dab44602d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -152,6 +152,7 @@ def __init__(
         spm_infill: bool = False,
         verbose: bool = True,
         # Extra Params
+        chat_handler_kwargs: Dict[str, Any] = {},
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -618,7 +619,8 @@ def __init__(
                 gguf_metadata = self.metadata,
                 clip_model_path = clip_model_path,
                 model_arch = None,
-                verbose = self.verbose
+                verbose = self.verbose,
+                **chat_handler_kwargs
             )
 
         eos_token_id = self.token_eos()

From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Sat, 16 May 2026 06:41:17 +0200
Subject: [PATCH 05/36] Detect media tags from the chat template, not model arch.

---
 llama_cpp/llama.py             |  1 -
 llama_cpp/llama_chat_format.py | 33 +++++++++++----------------------
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6dab44602d..7666b822a8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -618,7 +618,6 @@ def __init__(
             self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
                 gguf_metadata = self.metadata,
                 clip_model_path = clip_model_path,
-                model_arch = None,
                 verbose = self.verbose,
                 **chat_handler_kwargs
             )
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 40491968a9..0be38a19d3 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3839,47 +3839,36 @@ def from_pretrained(
         )
 
 class GenericMTMDChatHandler(MTMDChatHandler):
+    KNOWN_MEDIA_TAGS = [
+        "<|image_pad|>",
+        "<|audio_pad|>",
+        "<|video_pad|>",
+        "<|image|>",
+        "<|audio|>",
+        "<|video|>",
+        "[IMG]"
+    ]
+
     def __init__(
         self,
         gguf_metadata: Dict[str, Any],
         clip_model_path: str,
-        model_arch: Optional[str] = None,
         verbose: bool = True,
         **kwargs
     ) -> None:
         self.model_metadata = gguf_metadata
-
         self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
-        self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch
 
         if verbose:
             print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
-        
-        if self.arch is None:
-            if verbose:
-                print("Unknown model architecture. Will use general/most-common tags.")
-            
-            self.arch = "unknown"
 
         if self.chat_format is None:
             raise ValueError("Failed to get model chat template automatically.")
         
         super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
-        
-        if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]:
-            self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"]
-        elif self.arch in ["gemma4"]:
-            self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"]
-        elif self.arch in ["mistral3", "mistral4", "deepseek2"]:
-            self._chat_format_parser_tags += ["[IMG]"]
-        elif verbose:
-            print("Warning: Could not determine chat format parser tags.", flush = True)
     
     def __call__(self, **kwargs):
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
+        self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]
 
         if self.verbose:
             print(f"{self.log_prefix} - Start processing")

From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 18 May 2026 21:43:37 +0800
Subject: [PATCH 06/36] Update Submodule vendor/llama.cpp 39cf5d6..6db1304

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 39cf5d6191..6db130445d 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee
+Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322

From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 19 May 2026 19:36:28 +0800
Subject: [PATCH 07/36] build(ci+cu131): bundle LLVM OpenMP runtime for Windows
 CPU backends

- Add a PowerShell step to the Windows CI workflow to locate and copy
  `libomp140.x86_64.dll` from the Visual Studio redistributables.
- Place the runtime DLL into the `llama_cpp\lib` package directory.

This ensures that the dynamically loaded `ggml-cpu-*.dll` variants
(which are built with LLVM OpenMP on Windows) have their required
dependencies packaged in the wheel. Without this,
`ggml_backend_load_all_from_path()` can silently fail to load the CPU
backends at runtime on end-user machines.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml
index 14bea65d19..5f77003a5f 100644
--- a/.github/workflows/build-wheels-cu131-win.yml
+++ b/.github/workflows/build-wheels-cu131-win.yml
@@ -67,6 +67,31 @@ jobs:
           echo LIB=%LIB%>>%GITHUB_ENV%
           echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%
 
+      - name: Copy LLVM OpenMP runtime
+        shell: pwsh
+        run: |
+          # GGML CPU all-variant backends are built with LLVM OpenMP on Windows.
+          # The dynamically loaded ggml-cpu-*.dll files depend on this runtime.
+          # If it is missing from the wheel, ggml_backend_load_all_from_path()
+          # may fail to load CPU backend DLLs at runtime.
+          $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib"
+          New-Item -ItemType Directory -Force $packageLibDir | Out-Null
+
+          $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" `
+            -Recurse `
+            -Filter "libomp140.x86_64.dll" `
+            -ErrorAction SilentlyContinue |
+            Where-Object { $_.FullName -match "OpenMP\.LLVM" } |
+            Select-Object -First 1
+
+          if (!$omp) {
+            Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables."
+            exit 1
+          }
+
+          Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force
+          Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)"
+
       - name: Build wheel
         run: |
           $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')

From 677db7b0d5b834ae3d3831af4702ec21986ab335 Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Thu, 28 May 2026 00:12:35 +0200
Subject: [PATCH 08/36] Resolve file conflicts by dropping the cu131 OpenMP copy step.

---
 .github/workflows/build-wheels-cu131-win.yml | 25 --------------------
 1 file changed, 25 deletions(-)

diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml
index 5f77003a5f..14bea65d19 100644
--- a/.github/workflows/build-wheels-cu131-win.yml
+++ b/.github/workflows/build-wheels-cu131-win.yml
@@ -67,31 +67,6 @@ jobs:
           echo LIB=%LIB%>>%GITHUB_ENV%
           echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%
 
-      - name: Copy LLVM OpenMP runtime
-        shell: pwsh
-        run: |
-          # GGML CPU all-variant backends are built with LLVM OpenMP on Windows.
-          # The dynamically loaded ggml-cpu-*.dll files depend on this runtime.
-          # If it is missing from the wheel, ggml_backend_load_all_from_path()
-          # may fail to load CPU backend DLLs at runtime.
-          $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib"
-          New-Item -ItemType Directory -Force $packageLibDir | Out-Null
-
-          $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" `
-            -Recurse `
-            -Filter "libomp140.x86_64.dll" `
-            -ErrorAction SilentlyContinue |
-            Where-Object { $_.FullName -match "OpenMP\.LLVM" } |
-            Select-Object -First 1
-
-          if (!$omp) {
-            Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables."
-            exit 1
-          }
-
-          Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force
-          Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)"
-
       - name: Build wheel
         run: |
           $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')

From 4794c8c20ee731838cbc2c8d601ccb2c245d6893 Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Thu, 28 May 2026 01:52:48 +0200
Subject: [PATCH 09/36] Added support when using the keyword 'audio' instead of
 'audio_url'.

---
 llama_cpp/llama_chat_format.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index f9b9d52367..254195f95a 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2996,13 +2996,13 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa
                         media_items.append({"url": url, "type": "image"})
 
                     # 2. Audio Processing
-                    elif content_type in ["audio_url", "input_audio"]:
+                    elif content_type in ["audio", "audio_url", "input_audio"]:
                         if not self.is_support_audio:
                             raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")
 
                         # Case A: Handle custom/forward-compatible audio_url format
-                        if content_type == "audio_url":
-                            audio_url = content["audio_url"]
+                        if content_type == "audio_url" or content_type == "audio":
+                            audio_url = content[content_type]
                             url = audio_url if isinstance(audio_url, str) else audio_url["url"]
                             media_items.append({"url": url, "type": "audio"})
                         # Case B: Handle OpenAI standard input_audio format

From 323da373ad2f30409123bfba8322041113f0eba8 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 8 Jun 2026 23:08:36 +0800
Subject: [PATCH 10/36] build(CMakelists): Improve Windows LLVM OpenMP runtime
 discovery

- Improve diagnostics: report the selected runtime source and path, warn when
  an explicit override points to a missing file, and keep a clear runtime
  warning when no OpenMP DLL can be found.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 CMakeLists.txt | 79 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f09cdb783..1ace43c4aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,8 @@ function(llama_cpp_python_install_target target)
 endfunction()
 
 
-# Install an extra Windows runtime DLL into the Python package runtime directory.
+# Copy an extra Windows runtime DLL into the Python package runtime directory
+# during the CMake install step.
 #
 # Some dynamically loaded backend libraries depend on runtime DLLs that are not
 # always discoverable through $<TARGET_RUNTIME_DLLS:...>. One important example
@@ -75,7 +76,10 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file)
     endif()
 
     if(NOT EXISTS "${runtime_file}")
-        message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}")
+        message(WARNING
+            "Windows runtime DLL was selected but does not exist and will not be copied: "
+            "${runtime_file}"
+        )
         return()
     endif()
 
@@ -92,6 +96,11 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file)
     foreach(DIR ${INSTALL_DIRS})
         file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE)
 
+        message(STATUS
+            "Will copy Windows runtime DLL during install: "
+            "${runtime_file_cmake} -> ${DIR_CMAKE}"
+        )
+
         install(
             FILES "${runtime_file_cmake}"
             DESTINATION "${DIR_CMAKE}"
@@ -115,42 +124,62 @@ function(llama_cpp_python_install_windows_openmp_runtime)
     endif()
 
     set(OPENMP_RUNTIME_DLL "")
+    set(OPENMP_RUNTIME_SOURCE "")
+    set(FOUND_OPENMP_DLLS "")
+
+    if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL)
+        if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}")
+            set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}")
+            set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL")
+        else()
+            message(WARNING
+                "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: "
+                "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio "
+                "LLVM OpenMP runtime discovery."
+            )
+        endif()
+    endif()
 
-    if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}")
-        set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}")
-    else()
+    if(NOT OPENMP_RUNTIME_DLL)
         file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE)
         file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE)
 
-        set(VS_OPENMP_SEARCH_ROOTS
-            "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC"
-            "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC"
-            "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC"
-            "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC"
-        )
+        set(VS_OPENMP_SEARCH_PATTERNS
+            # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout.
+            "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
+            "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
 
-        foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS})
-            if(EXISTS "${ROOT}")
-                file(
-                    GLOB_RECURSE FOUND_OPENMP_DLLS
-                    "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll"
-                    "${ROOT}/**/libomp140.x86_64.dll"
-                )
+            # Keep these as secondary fallbacks for non-standard installs.
+            "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
+            "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
+            "C:/Windows/System32/libomp140.x86_64.dll"
+        )
 
-                if(FOUND_OPENMP_DLLS)
-                    list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL)
-                    break()
-                endif()
-            endif()
+        foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS})
+            file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}")
+            list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS})
         endforeach()
+
+        if(FOUND_OPENMP_DLLS)
+            list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS)
+            list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING)
+            list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL)
+            set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback")
+        endif()
     endif()
 
     if(OPENMP_RUNTIME_DLL)
-        message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}")
+        message(STATUS
+            "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: "
+            "${OPENMP_RUNTIME_DLL}"
+        )
         llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}")
     else()
         message(WARNING
-            "Could not find libomp140.x86_64.dll. "
+            "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. "
+            "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 "
+            "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), "
+            "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. "
             "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, "
             "the packaged ggml-cpu-*.dll files may fail to load at runtime. "
             "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll "

From 111819832614d488c840b266ad95f894f420bfea Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 8 Jun 2026 23:48:49 +0800
Subject: [PATCH 11/36] ci(test): add cuda 13.0.2 build workflow

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 .github/workflows/build-wheels-cu130-win.yml | 249 +++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 .github/workflows/build-wheels-cu130-win.yml

diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml
new file mode 100644
index 0000000000..790d7c9665
--- /dev/null
+++ b/.github/workflows/build-wheels-cu130-win.yml
@@ -0,0 +1,249 @@
+name: Build Wheels (CU130) for Windows
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  build_wheels:
+    name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["windows-2022"]
+        pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        cuda: ["13.0.2"]
+        cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"]
+
+    defaults:
+      run:
+        shell: pwsh
+
+    env:
+      CUDAVER: ${{ matrix.cuda }}
+      CUDAARCHVER: ${{ matrix.cudaarch }}
+      MAX_JOBS: 12
+
+    steps:
+      - name: Add MSBuild to PATH
+        uses: microsoft/setup-msbuild@v3
+        with:
+          msbuild-architecture: x64
+
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Inspect Visual Studio OpenMP runtime paths
+        run: |
+          Write-Output "ProgramFiles=$env:ProgramFiles"
+          Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}"
+          Write-Output ""
+
+          $vsRoots = @(
+            "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC",
+            "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC",
+            "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC",
+            "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC"
+          )
+
+          foreach ($root in $vsRoots) {
+            Write-Output "Checking root: $root"
+
+            if (Test-Path $root) {
+              Write-Output "  Exists: yes"
+              Write-Output "  MSVC version directories:"
+
+              Get-ChildItem $root -Directory -ErrorAction SilentlyContinue |
+                Sort-Object Name |
+                ForEach-Object {
+                  Write-Output "    $($_.FullName)"
+                }
+
+              Write-Output "  OpenMP runtime candidates:"
+
+              Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue |
+                Sort-Object FullName |
+                ForEach-Object {
+                  $sizeKB = [Math]::Round($_.Length / 1KB, 2)
+                  $sizeMB = [Math]::Round($_.Length / 1MB, 4)
+
+                  Write-Output "    Path: $($_.FullName)"
+                  Write-Output "    Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB"
+                }
+            } else {
+              Write-Output "  Exists: no"
+            }
+
+            Write-Output ""
+          }
+
+          Write-Output "Checking System32 fallback:"
+          $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll"
+
+          if (Test-Path $system32OpenMP) {
+            $dll = Get-Item $system32OpenMP
+            $sizeKB = [Math]::Round($dll.Length / 1KB, 2)
+            $sizeMB = [Math]::Round($dll.Length / 1MB, 4)
+
+            Write-Output "  Path: $($dll.FullName)"
+            Write-Output "  Size: $($dll.Length) bytes / $sizeKB KB / $sizeMB MB"
+          } else {
+            Write-Output "  Not found: $system32OpenMP"
+          }
+
+      - name: Install CUDA ${{ matrix.cuda }}
+        uses: Jimver/cuda-toolkit@v0.2.35
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          use-github-cache: false
+
+      - name: Install uv and Python ${{ matrix.pyver }}
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.pyver }}
+          activate-environment: true
+          enable-cache: true
+
+      - name: Install dependencies
+        run: |
+          git config --system core.longpaths true
+          uv pip install --upgrade build setuptools wheel packaging
+
+      - name: Setup MSVC environment for nvcc
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          echo PATH=%PATH%>>%GITHUB_ENV%
+          echo INCLUDE=%INCLUDE%>>%GITHUB_ENV%
+          echo LIB=%LIB%>>%GITHUB_ENV%
+          echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%
+
+      - name: Build wheel
+        run: |
+          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')
+
+          $env:CUDA_HOME = $env:CUDA_PATH
+          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH
+          $env:VERBOSE = '1'
+
+          # Force CMake to use Ninja + LLVM/Clang instead of the default
+          # Visual Studio generator. MSVC skips several GGML CPU all-variant
+          # backends, such as ivybridge, piledriver, cooperlake, zen4, and
+          # sapphirerapids.
+          $env:CMAKE_GENERATOR = 'Ninja Multi-Config'
+
+          $toolchainCandidates = @(
+            (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"),
+            (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake")
+          )
+
+          $toolchainFile = $toolchainCandidates |
+            Where-Object { Test-Path $_ } |
+            Select-Object -First 1
+
+          if (!$toolchainFile) {
+            Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')"
+            exit 1
+          }
+
+          $toolchainFile = $toolchainFile.Replace('\', '/')
+          Write-Output "Using toolchain file: $toolchainFile"
+
+          # Build one CUDA wheel with dynamic GGML backends:
+          # - GGML_BACKEND_DL enables runtime-loadable backend DLLs.
+          # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64,
+          #   ggml-cpu-haswell, ggml-cpu-alderlake, etc.
+          # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU.
+
+          # Suppress CUDA compiler warnings
+          $cudaDiagSuppress = '--diag-suppress=177,221,550'
+
+          $cmakeArgs = @(
+            # Windows toolchain / common runtime
+            '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake'
+            '-DLLAMA_BUILD_BORINGSSL=ON'
+
+            # Disable non-wheel targets
+            '-DLLAMA_BUILD_EXAMPLES=OFF'
+            '-DLLAMA_BUILD_TESTS=OFF'
+            '-DLLAMA_BUILD_TOOLS=OFF'
+            '-DLLAMA_BUILD_SERVER=OFF'
+            '-DLLAMA_BUILD_UI=OFF'
+            '-DLLAMA_USE_PREBUILT_UI=OFF'
+            '-DLLAMA_CURL=OFF'
+
+            # GGML dynamic backend layout
+            '-DGGML_CPU=ON'
+            '-DGGML_CUDA=ON'
+            '-DGGML_NATIVE=OFF'
+            '-DGGML_BACKEND_DL=ON'
+            '-DGGML_CPU_ALL_VARIANTS=ON'
+            '-DGGML_OPENMP=ON'
+
+            # CUDA backend
+            "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER"
+            '-DGGML_CUDA_FORCE_MMQ=ON'
+            '-DCUDA_SEPARABLE_COMPILATION=ON'
+            "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress"
+
+            # Build behavior
+            "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS"
+            '-DENABLE_CCACHE=ON'
+          )
+
+          $env:CMAKE_ARGS = $cmakeArgs -join ' '
+          Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS"
+
+          python -m build --wheel
+
+          # Check if wheel was built
+          if (!(Test-Path '.\dist\*.whl')) {
+            Write-Error "No wheel built in dist/ directory"
+            exit 1
+          }
+
+          $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1
+
+          # Wheel filename format:
+          #   name-version-python_tag-abi_tag-platform_tag.whl
+          $parts = $wheelFile.Name.Split('-')
+          $distName = $parts[0]
+          $version  = $parts[1]
+          $pyTag    = $parts[2]
+          $abiTag   = $parts[3]
+          $platTag  = $parts[4]
+
+          # CPU all-variants is now an internal runtime layout detail.
+          $newVersion = "$version+cu$cudaVersion"
+          $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag"
+
+          # Rename wheel file
+          Rename-Item -Path $wheelFile.FullName -NewName $newName
+          Write-Output "Renamed wheel to: $newName"
+
+          # Write the build tag to the output
+          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
+          Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV
+
+      - name: Get current date
+        id: get-date
+        run: |
+          $currentDate = Get-Date -UFormat "%Y%m%d"
+          Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV
+
+      - name: Create release
+        if: always() && env.TAG_VERSION != ''
+        uses: softprops/action-gh-release@v3
+        with:
+          files: dist/*
+          # Set tag_name to v<tag>-cu<cuda_version>-win-<date>
+          tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From 7a6ee9fcd57438a950eb2ee6c8e079f2409c2765 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 9 Jun 2026 00:33:10 +0800
Subject: [PATCH 12/36] build(CMakeLists): prefer VS 2022 VC143 OpenMP redist
 and keep System32 as final fallback.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 CMakeLists.txt | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ace43c4aa..5b2cfeeb8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,7 +135,7 @@ function(llama_cpp_python_install_windows_openmp_runtime)
             message(WARNING
                 "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: "
                 "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio "
-                "LLVM OpenMP runtime discovery."
+                "VC143 LLVM OpenMP runtime discovery."
             )
         endif()
     endif()
@@ -144,18 +144,19 @@ function(llama_cpp_python_install_windows_openmp_runtime)
         file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE)
         file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE)
 
-        set(VS_OPENMP_SEARCH_PATTERNS
-            # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout.
+        set(VS_OPENMP_VC143_PATTERNS
+            # Prefer VS 2022 VC143 LLVM OpenMP redist paths.
+            # The MSVC version directory is intentionally globbed because
+            # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207.
             "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
             "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
 
-            # Keep these as secondary fallbacks for non-standard installs.
+            # Secondary VS layout fallbacks for unusual installations.
             "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
             "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll"
-            "C:/Windows/System32/libomp140.x86_64.dll"
         )
 
-        foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS})
+        foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS})
             file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}")
             list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS})
         endforeach()
@@ -164,7 +165,16 @@ function(llama_cpp_python_install_windows_openmp_runtime)
             list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS)
             list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING)
             list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL)
-            set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback")
+            set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist")
+        endif()
+    endif()
+
+    if(NOT OPENMP_RUNTIME_DLL)
+        set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll")
+
+        if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}")
+            set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}")
+            set(OPENMP_RUNTIME_SOURCE "System32 fallback")
         endif()
     endif()
 
@@ -177,9 +187,10 @@ function(llama_cpp_python_install_windows_openmp_runtime)
     else()
         message(WARNING
             "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. "
-            "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 "
-            "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), "
-            "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. "
+            "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 "
+            "Enterprise/BuildTools VC143 redist paths under Program Files and "
+            "Program Files (x86), with a fuzzy MSVC version match such as "
+            "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. "
             "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, "
             "the packaged ggml-cpu-*.dll files may fail to load at runtime. "
             "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll "

From 50bbdd61fdf7e2e1cd7582a2183e476c98a47c17 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 9 Jun 2026 02:54:00 +0800
Subject: [PATCH 13/36] Update Submodule vendor/llama.cpp f0156d1..7d2b45b

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f0156d1401..7d2b45b4f7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f0156d1401500512ad85042ccf38970568b12253
+Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545

From 55e855b75f901b494259a1c81b45ac80f0e3013f Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 9 Jun 2026 05:03:15 +0800
Subject: [PATCH 14/36] Update mtmd API 20260609

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/mtmd_cpp.py | 293 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 283 insertions(+), 10 deletions(-)

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 4542555c65..61fb0e7859 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -10,12 +10,14 @@
     c_uint8,
     c_int32,
     c_uint32,
+    c_int64,
     c_float,
     c_void_p,
     c_size_t,
     POINTER,
     _Pointer,  # type: ignore
     Structure,
+    CFUNCTYPE
 )
 import pathlib
 from typing import (
@@ -318,6 +320,16 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int:
     """
     ...
 
+# // get the current marker string
+# MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx);
+@ctypes_function_mtmd(
+    "mtmd_get_marker", [mtmd_context_p_ctypes], c_char_p)
+def mtmd_get_marker(ctx: mtmd_context_p) -> c_char_p:
+    """
+    get the current marker string
+    """
+    ...
+
 # // mtmd_bitmap
 # //
 # // if bitmap is image:
@@ -420,6 +432,58 @@ def mtmd_bitmap_set_id(
     ...
 
 
+# // mtmd_bitmap lazy
+# //
+# // this is a special bitmap that:
+# // - does not hold the actual data
+# // - can be expanded into one or more chunks (either media or text chunks)
+# // user must provide a callback to fill in the data when mtmd_tokenize() is called
+# // this is useful for large video inputs:
+# // - allow reading video frame by frame, without loading the entire video into memory
+# // - allow tracking the whole video with a single ID (for example, the file hash)
+
+# // set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically
+# // set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically
+# // either out_bitmap or out_text can be set, but not both
+# // out_bitmap cannot be another lazy bitmap (no nested lazy allowed)
+# // return value:
+# //    0 on success
+# //   -1 on EOF (signal to mtmd_tokenize to move on)
+# //   -2 on error (signal to mtmd_tokenize to abort)
+# typedef int(* mtmd_bitmap_lazy_callback)(
+#     size_t chunk_idx,
+#     void * user_data,
+#     mtmd_bitmap ** out_bitmap,
+#     char ** out_text);
+mtmd_bitmap_lazy_callback = CFUNCTYPE(
+    c_int,                          # return: 0 on success, -1 on EOF, -2 on error
+    c_size_t,                       # chunk_idx
+    c_void_p,                       # user_data
+    POINTER(mtmd_bitmap_p_ctypes),  # mtmd_bitmap ** out_bitmap (must be a ctypes type, not the typing alias)
+    POINTER(c_char_p),              # char ** out_text
+)
+
+# MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
+#                                              const char * id, // usually set to file hash
+#                                              void * user_data,
+#                                              mtmd_bitmap_lazy_callback callback);
+@ctypes_function_mtmd(
+    "mtmd_bitmap_init_lazy", [
+        mtmd_context_p_ctypes,
+        c_char_p,
+        c_void_p,
+        mtmd_bitmap_lazy_callback,
+    ], mtmd_bitmap_p_ctypes)
+def mtmd_bitmap_init_lazy(
+    ctx: mtmd_context_p,
+    id: c_char_p,
+    user_data: c_void_p,
+    callback: mtmd_bitmap_lazy_callback,  # type: ignore
+    /,
+) -> mtmd_bitmap_p:
+    """Create a lazy bitmap; `callback` supplies its data when mtmd_tokenize() is called."""
+
+
 # // mtmd_input_chunks
 # //
 # // this is simply a list of mtmd_input_chunk
@@ -772,6 +836,9 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p:
 # // BREAKING CHANGES are expected.
 # //
 
+# struct mtmd_helper_video;
+mtmd_helper_video_p = NewType("mtmd_helper_video_p", int)
+mtmd_helper_video_p_ctypes = c_void_p
 
 # // Set callback for all future logging events.
 # // If this is not called, or NULL is supplied, everything is output on stderr.
@@ -786,11 +853,33 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): #
     ...
 
 
+# // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
+# MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
+@ctypes_function_mtmd(
+    "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool)
+def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool:
+    """
+    Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
+    """
+    ...
+
+
+# struct mtmd_helper_bitmap_wrapper {
+#     mtmd_bitmap * bitmap;
+#     mtmd_helper_video * video_ctx;
+# };
+class mtmd_helper_bitmap_wrapper(Structure):
+    _fields_ = [
+        ("bitmap", mtmd_bitmap_p_ctypes),           # mtmd_bitmap * (NULL on failure)
+        ("video_ctx", mtmd_helper_video_p_ctypes),  # mtmd_helper_video * (may be NULL)
+    ]
+mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper)
+
 # // helper function to construct a mtmd_bitmap from a file
 # // it calls mtmd_helper_bitmap_init_from_buf() internally
 # // returns nullptr on failure
 # // this function is thread-safe
-# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
+# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
 
 @ctypes_function_mtmd(
     "mtmd_helper_bitmap_init_from_file", [
@@ -798,14 +887,14 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): #
         c_char_p,
         c_bool,
     ],
-    mtmd_bitmap_p_ctypes
+    mtmd_helper_bitmap_wrapper
 )
 def mtmd_helper_bitmap_init_from_file(
     ctx: mtmd_context_p,
     fname: c_char_p,
     placeholder: c_bool,
     /,
-) -> mtmd_bitmap_p:
+) -> mtmd_helper_bitmap_wrapper:
     """
     helper function to construct a mtmd_bitmap from a file
     it calls mtmd_helper_bitmap_init_from_buf() internally
@@ -818,10 +907,13 @@ def mtmd_helper_bitmap_init_from_file(
 # // supported formats:
 # //     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
 # //     audio: formats supported by miniaudio: wav, mp3, flac
-# // note: audio files will be auto-detected based on magic bytes
+# // note:
+# //   - for now, video input is only supported via C++ helper functions
+# //   - audio files will be auto-detected based on magic bytes
+# //   - output bitmap will have FNV hash as the ID
 # // returns nullptr on failure
 # // this function is thread-safe
-# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
+# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
 @ctypes_function_mtmd(
     "mtmd_helper_bitmap_init_from_buf", [
         mtmd_context_p_ctypes,
@@ -829,7 +921,7 @@ def mtmd_helper_bitmap_init_from_file(
         c_size_t,
         c_bool,
     ],
-    mtmd_bitmap_p_ctypes
+    mtmd_helper_bitmap_wrapper
 )
 def mtmd_helper_bitmap_init_from_buf(
     ctx: mtmd_context_p,
@@ -837,13 +929,16 @@ def mtmd_helper_bitmap_init_from_buf(
     len: c_size_t,
     placeholder: c_bool,
     /,
-) -> mtmd_bitmap_p:
+) -> mtmd_helper_bitmap_wrapper:
     """
     helper function to construct a mtmd_bitmap from a buffer containing a file
     supported formats:
-         image: formats supported by stb_image: jpg, png, bmp, gif, etc.
-         audio: formats supported by miniaudio: wav, mp3, flac
-    note: audio files will be auto-detected based on magic bytes
+        image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+        audio: formats supported by miniaudio: wav, mp3, flac
+    note:
+        - for now, video input is only supported via C++ helper functions
+        - audio files will be auto-detected based on magic bytes
+        - output bitmap will have FNV hash as the ID
     returns nullptr on failure
     """
     ...
@@ -1020,3 +1115,181 @@ def mtmd_helper_decode_image_chunk(
     ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
     """
     ...
+
+# //
+# // video input helpers (requires ffmpeg/ffprobe installed on the system)
+# // the notion of video only exists at the helper level, it is not visible to the core mtmd library
+# //
+# // NOTE: this implementation is model-agnostic, it can be used with any vision-capable model
+# //       however, it may not be accurate for some specific models
+# //       (this is expected for now, to keep the implementation simple)
+# //
+
+# struct mtmd_helper_video_info {
+#     uint32_t width;
+#     uint32_t height;
+#     float    fps;      // effective fps (fps_target if set, else original video fps)
+#     int32_t  n_frames; // estimated total frames at effective fps (-1 if unknown)
+# };
+class mtmd_helper_video_info(Structure):
+    _fields_ = [
+        ("width", c_uint32),
+        ("height", c_uint32),
+        ("fps", c_float),
+        ("n_frames", c_int32),
+    ]
+mtmd_helper_video_info_p_ctypes = POINTER(mtmd_helper_video_info)
+
+
+# struct mtmd_helper_video_init_params {
+#     float fps_target;            // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f
+#     const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH
+#     int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.5s]"); <= 0 means no timestamp, defaulted to 5000ms
+#     // TODO @ngxson : allow "placeholder" bitmap output for counting tokens
+# };
+class mtmd_helper_video_init_params(Structure):
+    _fields_ = [
+        ("fps_target", c_float),
+        ("ffmpeg_bin_dir", c_char_p),
+        ("timestamp_interval_ms", c_int64),
+    ]
+mtmd_helper_video_init_params_p_ctypes = POINTER(mtmd_helper_video_init_params)
+
+
+# MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void);
+@ctypes_function_mtmd(
+    "mtmd_helper_video_init_params_default",
+    [],
+    mtmd_helper_video_init_params,
+)
+def mtmd_helper_video_init_params_default(
+    /,
+) -> mtmd_helper_video_init_params:
+    """
+    get default init params for mtmd_helper_video
+    """
+    ...
+
+
+# // returns NULL on failure (ffprobe not found, file unreadable, etc.)
+# MTMD_API mtmd_helper_video * mtmd_helper_video_init(
+#                     struct mtmd_context * mctx,
+#                     const char * path,
+#                     struct mtmd_helper_video_init_params params);
+@ctypes_function_mtmd(
+    "mtmd_helper_video_init", [
+        mtmd_context_p_ctypes,
+        c_char_p,
+        mtmd_helper_video_init_params,
+    ],
+    mtmd_helper_video_p_ctypes)
+def mtmd_helper_video_init(
+    mctx: mtmd_context_p,
+    path: c_char_p,
+    params: mtmd_helper_video_init_params,
+    /,
+) -> mtmd_helper_video_p:
+    """
+    helper function to init an mtmd_helper_video object
+    returns NULL on failure (ffprobe not found, file unreadable, etc.)
+    """
+    ...
+
+
+# // Same as mtmd_helper_video_init(), but reads from an in-memory buffer.
+# // The buffer is copied internally; the caller does not need to keep it alive.
+# // Note: pipe input is not seekable, so seeking will use output-side seeking
+# // (ffmpeg decodes and discards frames up to the target position).
+# MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf(
+#                     struct mtmd_context * mctx,
+#                     const unsigned char * buf, size_t len,
+#                     struct mtmd_helper_video_init_params params);
+@ctypes_function_mtmd(
+    "mtmd_helper_video_init_from_buf",
+    [
+        mtmd_context_p_ctypes,
+        c_char_p,
+        c_size_t,
+        mtmd_helper_video_init_params,
+    ],
+    mtmd_helper_video_p_ctypes,
+)
+def mtmd_helper_video_init_from_buf(
+    mctx: mtmd_context_p,
+    buf: c_char_p,
+    len: int,
+    params: mtmd_helper_video_init_params,
+    /,
+) -> mtmd_helper_video_p:
+    """
+    helper function to init an mtmd_helper_video object from an in-memory video buffer
+
+    The buffer is copied internally, so the caller does not need to keep it alive
+    after this function returns.
+    """
+    ...
+
+
+# MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx);
+@ctypes_function_mtmd("mtmd_helper_video_free", [mtmd_helper_video_p_ctypes], None)
+def mtmd_helper_video_free(
+    ctx: mtmd_helper_video_p,
+    /,
+) -> None:
+    """
+    free an mtmd_helper_video object
+    """
+    ...
+
+
+# MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx);
+@ctypes_function_mtmd("mtmd_helper_video_get_info", [mtmd_helper_video_p_ctypes], mtmd_helper_video_info)
+def mtmd_helper_video_get_info(
+    ctx: mtmd_helper_video_p,
+    /,
+) -> mtmd_helper_video_info:
+    """
+    get video information from an mtmd_helper_video object
+    """
+    ...
+
+
+# // Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call.
+# // *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free()
+# // *out_text   - heap-allocated (always via strdup/malloc); caller must free with free()
+# // returns 0 on success, -1 on EOF, -2 on error
+# MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
+#             mtmd_bitmap ** out_bitmap,
+#             char ** out_text);
+@ctypes_function_mtmd(
+    "mtmd_helper_video_read_next",
+    [
+        mtmd_helper_video_p_ctypes,
+        POINTER(mtmd_bitmap_p_ctypes),
+        POINTER(c_char_p),
+    ],
+    c_int32,
+)
+def mtmd_helper_video_read_next(
+    ctx: mtmd_helper_video_p,
+    out_bitmap: POINTER(mtmd_bitmap_p_ctypes),  # type: ignore
+    out_text:   POINTER(c_char_p),              # type: ignore
+    /,
+) -> int:
+    """
+    read the next item from the video stream
+
+    Exactly one of out_bitmap or out_text is set per successful call.
+
+    out_bitmap:
+        heap-allocated bitmap; caller must free it with mtmd_bitmap_free()
+
+    out_text:
+        heap-allocated string via strdup/malloc; caller must free it with free()
+
+    returns:
+        0  on success
+        -1 on EOF
+        -2 on error
+    """
+    ...

From 10b4addb9d5f2ff71bddde34f43f8a43fac44b61 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 9 Jun 2026 05:08:49 +0800
Subject: [PATCH 15/36] feat(mtmd): add video input support to MTMDChatHandler

- Add video_url handling to the MTMD chat template and media extraction
pipeline. Detect whether the loaded libmtmd build supports video helpers
and reject video inputs early when MTMD_VIDEO is unavailable.

- Update media loading and bitmap creation for the new helper wrapper API.
mtmd_helper_bitmap_init_from_buf now returns a bitmap wrapper containing
both the decoded bitmap and an optional video helper context, so keep the
video context alive until mtmd_tokenize completes and release it afterward.

- Also consolidate duplicated audio/video byte loading into a shared
_load_bytes helper, reuse it for image loading, and add richer default HTTP
headers for remote media requests.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_chat_format.py | 173 ++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 58 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index fb42a59f23..2224466436 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3064,6 +3064,8 @@ class MTMDChatHandler:
                             "{% else %}"
                                 "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}"
                             "{% endif %}"
+                        "{% elif content.type == 'video_url' %}"
+                            "{{ content.video_url if content.video_url is string else content.video_url.url }}"
                         "{% elif content.type == 'text' %}"
                             "{{ content.text }}"
                         "{% endif %}"
@@ -3114,6 +3116,10 @@ def __init__(
         self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
         self.extra_template_arguments: dict[str, Any] = {}
 
+        self.is_support_vision = False
+        self.is_support_audio = False
+        self.is_support_video = False
+
         if not os.path.exists(clip_model_path):
             raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")
 
@@ -3182,6 +3188,15 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama):
             if self.verbose:
                 print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr)
 
+        # Check if video is supported
+        self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx)
+        if self.is_support_video:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr)
+        else:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr)
+
     def close(self) -> None:
         """Explicitly free the mtmd context and vision model resources."""
         if getattr(self, "mtmd_ctx", None) is not None:
@@ -3259,7 +3274,16 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa
                                 if url:
                                     media_items.append({"url": url, "type": "audio"})
 
-                    # 3. Text & Unknown Types
+                    # 3. Video Processing
+                    elif content_type == "video_url":
+                        if not self.is_support_video:
+                            raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.")
+
+                        video_url = content["video_url"]
+                        url = video_url if isinstance(video_url, str) else video_url["url"]
+                        media_items.append({"url": url, "type": "video"})
+
+                    # 4. Text & Unknown Types
                     elif content_type == "text":
                         continue
                     else:
@@ -3274,6 +3298,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes):
         Supported formats:
           - Images (via stb_image): jpg, png, bmp, etc.
           - Audio (via miniaudio): wav, mp3, flac.
+          - Video: depends on whether MTMD_VIDEO was enabled at build time.
 
         Note:
           - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes.
@@ -3283,25 +3308,35 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes):
             media_bytes (bytes): The raw byte content of the media file.
 
         Returns:
-            mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features.
+            bitmap: mtmd_bitmap *
+            video_ctx: mtmd_helper_video * or NULL
         """
         if self.mtmd_ctx is None:
             raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.")
 
-        # Create bitmap from buffer using helper function
-        bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
+        if not media_bytes:
+            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.")
+
+        buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes)
+
+        wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
             self.mtmd_ctx,
-            (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)),
+            buf,
             len(media_bytes),
             False,
         )
 
-        if bitmap is None:
-            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): "
-                                "Failed to load image or audio file from media bytes "
-                                "(unsupported media format or corrupted data).")
+        if not wrapper.bitmap:
+            if wrapper.video_ctx:
+                self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx)
 
-        return bitmap
+            raise ValueError(
+                f"{self.log_prefix}(_create_bitmap_from_bytes): "
+                "Failed to load media from bytes "
+                "(unsupported media format, corrupted data, or missing helper support)."
+            )
+
+        return wrapper.bitmap, wrapper.video_ctx
 
 
     def _process_mtmd_prompt(
@@ -3360,16 +3395,17 @@ def _process_mtmd_prompt(
         # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding
         bitmaps = [None] * len(media_items)
         bitmap_cleanup = []
+        video_cleanup = []
         chunks = None
 
         try:
             # Concurrent Media Decoding
             import concurrent.futures
             if media_items:
-                def _create_bitmap_func(idx: int, item: str):
+                def _create_bitmap_func(idx: int, item: dict):
                     media_bytes = self.load_media(item["url"], item["type"])
-                    bitmap = self._create_bitmap_from_bytes(media_bytes)
-                    return idx, bitmap
+                    bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes)
+                    return idx, bitmap, video_ctx
                 # This method uses multi-threaded parallel processing to convert images or audio to bitmaps,
                 # which can be used in the future to process large numbers of video frames.
                 max_workers = min(llama.n_threads, len(media_items))
@@ -3377,10 +3413,14 @@ def _create_bitmap_func(idx: int, item: str):
                     futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)]
 
                     for future in concurrent.futures.as_completed(futures):
-                        idx, bitmap = future.result()
+                        idx, bitmap, video_ctx = future.result()
+
                         bitmaps[idx] = bitmap
                         bitmap_cleanup.append(bitmap)
 
+                        if video_ctx:
+                            video_cleanup.append(video_ctx)
+
                 # Strict validation: Abort if any thread failed to decode its assigned media
                 if any(b is None for b in bitmaps):
                     raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.")
@@ -3415,6 +3455,12 @@ def _create_bitmap_func(idx: int, item: str):
             if result != 0:
                 raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.")
 
+            # Video helper contexts only need to stay alive until mtmd_tokenize() completes.
+            if video_cleanup:
+                for video_ctx in video_cleanup:
+                    self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
+                video_cleanup.clear()
+
             # 6. Virtual Token Ledger Construction
             full_prompt_ids = []
             chunk_token_spans = []
@@ -3424,6 +3470,7 @@ def _create_bitmap_func(idx: int, item: str):
             # Cursor to track the actual media contents (URLs or base64 data) provided by the user
             media_items_count = len(media_items)
             media_items_cur = 0
+            last_media_id = None
 
             for i in range(n_chunks):
                 chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
@@ -3463,7 +3510,11 @@ def _create_bitmap_func(idx: int, item: str):
                         # while instantly breaking the match if the image content changes.
                         # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100
                         media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100
+                        last_media_id = media_id
                         media_items_cur += 1
+                    elif last_media_id is not None:
+                        # video may expand into multiple image chunks from one media marker
+                        media_id = last_media_id
                     else:
                         # Magic Negative Number as fallback :)
                         media_id = -314159
@@ -3492,6 +3543,12 @@ def _create_bitmap_func(idx: int, item: str):
                 for bitmap in bitmap_cleanup:
                     self._mtmd_cpp.mtmd_bitmap_free(bitmap)
                 bitmap_cleanup = None
+            # Free videos
+            if len(video_cleanup) > 0:
+                for video_ctx in video_cleanup:
+                    self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
+                video_cleanup = None
+
             bitmaps = None
 
             raise e
@@ -3825,18 +3882,22 @@ def __call__(
     def load_media(self, media_url: str, media_type: str) -> bytes:
         """
         Unified dispatcher for loading media payloads.
-        Routes the URL/URI to the specific image or audio processor based on the media_type.
+        Routes the URL/URI to the specific image, audio, or video processor based on the media_type.
         """
         if media_type == "image":
             return self._load_image(media_url)
+
         elif media_type == "audio":
-            audio_bytes = self._load_audio(media_url)
-            # Apply ironclad magic bytes validation before returning
+            audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio")
             try:
                 self.detect_audio_format(audio_bytes)
             except ValueError as e:
                 raise ValueError(f"{self.log_prefix}(load_media): {e}")
             return audio_bytes
+
+        elif media_type == "video":
+            return self._load_bytes(media_url, timeout=30, kind="video")
+
         else:
             raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'")
 
@@ -3876,41 +3937,51 @@ def detect_audio_format(audio_bytes: bytes) -> str:
                 "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
             )
 
+    DEFAULT_HTTP_HEADERS = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/148.0.0.0 Safari/537.36"
+        ),
+    }
+
     @staticmethod
-    def _load_audio(audio_url: str) -> bytes:
+    def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes:
         """
-        Load audio from either a URL, local path, or a data URI and return raw bytes.
+        Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL.
         """
+        media_bytes = b""
 
-        audio_bytes = b""
-
-        # 1. Handle data URI (base64)
-        if audio_url.strip().startswith("data:"):
-            comma_pos = audio_url.find(",")
+        # 1. Handle data URI
+        if media_url.strip().startswith("data:"):
+            comma_pos = media_url.find(",")
             if comma_pos == -1:
                 raise ValueError("Invalid data URI: missing comma separator")
-            base64_data = audio_url[comma_pos + 1 :]
-            audio_bytes = base64.b64decode(base64_data)
+
+            base64_data = media_url[comma_pos + 1:]
+            media_bytes = base64.b64decode(base64_data)
 
         # 2. Handle local file path
-        elif os.path.exists(audio_url):
-            with open(audio_url, "rb") as f:
-                audio_bytes = f.read()
+        elif os.path.exists(media_url):
+            with open(media_url, "rb") as f:
+                media_bytes = f.read()
 
         # 3. Handle remote URL via HTTP/HTTPS
         else:
-            headers = {"User-Agent": "Mozilla/5.0"}
-            req = urllib.request.Request(audio_url, headers=headers)
+            req = urllib.request.Request(
+                media_url,
+                headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS,
+            )
             try:
-                with urllib.request.urlopen(req, timeout=15) as f:
-                    audio_bytes = f.read()
+                with urllib.request.urlopen(req, timeout=timeout) as f:
+                    media_bytes = f.read()
             except (URLError, HTTPError) as e:
-                raise ConnectionError(f"Failed to download audio from {audio_url}: {e}")
+                raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}")
 
-        if not audio_bytes:
-            raise ValueError("Empty audio data received")
+        if not media_bytes:
+            raise ValueError(f"Empty {kind} data received")
 
-        return audio_bytes
+        return media_bytes
 
     @staticmethod
     def _load_image(image_url: str) -> bytes:
@@ -3926,28 +3997,14 @@ def _load_image(image_url: str) -> bytes:
         Returns:
             JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models.
         """
-        image_bytes = b""
-
-        # 1. Handle data URI (base64)
-        if image_url.strip().startswith("data:"):
-            # Split only once from the right to correctly handle mime types containing commas
-            comma_pos = image_url.find(",")
-            if comma_pos == -1:
-                raise ValueError("Invalid data URI: missing comma separator")
-            base64_data = image_url[comma_pos + 1 :]
-            image_bytes = base64.b64decode(base64_data)
-
-        # 2. Handle local/remote URL
-        else:
-            headers = {"User-Agent": "Mozilla/5.0"}
-            req = urllib.request.Request(image_url, headers=headers)
-
-            try:
-                with urllib.request.urlopen(req, timeout=15) as f:
-                    image_bytes = f.read()
-            except (URLError, HTTPError) as e:
-                raise ConnectionError(f"Failed to download image from {image_url}: {e}")
+        # 1. Load image bytes from image_url
+        image_bytes = MTMDChatHandler._load_bytes(
+            image_url,
+            timeout=15,
+            kind="image",
+        )
 
+        # 2. Check if image_bytes is empty.
         if not image_bytes:
             raise ValueError("Empty image data received")
 

From e4dcac1af57b58973ecf7e206a3c25b3c367d881 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 9 Jun 2026 22:22:15 +0800
Subject: [PATCH 16/36] Update Submodule vendor/llama.cpp 7d2b45b..d6d0ce8

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/mtmd_cpp.py | 14 ++++++--------
 vendor/llama.cpp      |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 61fb0e7859..30ca8fab90 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -459,8 +459,8 @@ def mtmd_bitmap_set_id(
     c_int,
     c_size_t,                 # chunk_idx
     c_void_p,                 # user_data
-    POINTER(mtmd_bitmap_p),   # mtmd_bitmap ** out_bitmap
-    POINTER(c_char_p),        # char ** out_text
+    POINTER(mtmd_bitmap_p_ctypes),   # mtmd_bitmap ** out_bitmap
+    POINTER(c_char_p),               # char ** out_text
 )
 
 # MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
@@ -856,7 +856,7 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): #
 # // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
 # MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
 @ctypes_function_mtmd(
-    "mtmd_helper_support_video", [mtmd_context_p], c_bool)
+    "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool)
 def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool:
     """
     Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
@@ -870,8 +870,8 @@ def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool:
 # };
 class mtmd_helper_bitmap_wrapper(Structure):
     _fields_ = [
-        ("bitmap", mtmd_bitmap_p),
-        ("video_ctx", mtmd_helper_video_p),
+        ("bitmap", mtmd_bitmap_p_ctypes),
+        ("video_ctx", mtmd_helper_video_p_ctypes),
     ]
 mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper)
 
@@ -1162,9 +1162,7 @@ class mtmd_helper_video_init_params(Structure):
     [],
     mtmd_helper_video_init_params,
 )
-def mtmd_helper_video_init_params_default(
-    /,
-) -> mtmd_helper_video_init_params:
+def mtmd_helper_video_init_params_default() -> mtmd_helper_video_init_params:
     """
     get default init params for mtmd_helper_video
     """
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 7d2b45b4f7..d6d0ce8215 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545
+Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f

From 54f56bd8f89769f2021f31eba0aa377dc290f203 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sat, 13 Jun 2026 00:46:09 +0800
Subject: [PATCH 17/36] Update Submodule vendor/llama.cpp d6d0ce8..ebc1077

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index d6d0ce8215..ebc10770ac 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f
+Subproject commit ebc10770ac5a9331824c53ef0c6adad780904dc3

From 6d1bd3b8d751a3a2ac86d377ecd34a3b37278b15 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 00:04:56 +0800
Subject: [PATCH 18/36] Update Submodule vendor/llama.cpp ebc1077..e8067a8

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index ebc10770ac..e8067a8b36 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit ebc10770ac5a9331824c53ef0c6adad780904dc3
+Subproject commit e8067a8b3624aa40cc88ecb2940060e5d65b7532

From 971ee384227f6268f244c93f620b12f0a6ff47c0 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 01:03:09 +0800
Subject: [PATCH 19/36] Update(mtmd): Add mtmd batching API

- Sync upstream: mtmd: add batching API (#24384)

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_chat_format.py |   3 +
 llama_cpp/mtmd_cpp.py          | 142 ++++++++++++++++++++++++++++++---
 2 files changed, 134 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 2224466436..520d2429d4 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3094,6 +3094,7 @@ def __init__(
             use_gpu: bool = True,
             image_min_tokens: int = -1,
             image_max_tokens: int = -1,
+            batch_max_tokens: int = 1024,
             **kwargs
     ):
 
@@ -3108,6 +3109,7 @@ def __init__(
         self.clip_model_path = clip_model_path
         self.image_min_tokens = image_min_tokens
         self.image_max_tokens = image_max_tokens
+        self.batch_max_tokens = batch_max_tokens
         self.use_gpu = use_gpu
         self.verbose = verbose
 
@@ -3152,6 +3154,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama):
         if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0:
             raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) "
                                 f"cannot be less than image_min_tokens ({self.image_min_tokens}).")
+        self.mctx_params.batch_max_tokens = self.batch_max_tokens
 
         # Cache the model's eos token and bos token
         self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore')
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 30ca8fab90..4513761a63 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -153,6 +153,21 @@ class mtmd_pos_type(enum.IntEnum):
 mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
 mtmd_input_chunks_p_ctypes = c_void_p
 
+# struct mtmd_batch {
+#     mtmd_context * ctx;
+#     std::vector<const mtmd_input_chunk *> entries;
+#     std::vector<float> output_embd; // aggregated output embedding for the whole batch
+#     mtmd_batch(mtmd_context * ctx): ctx(ctx) {}
+#     int32_t n_tokens() const {
+#         int32_t n = 0;
+#         for (const auto * chunk : entries) {
+#             n += mtmd_input_chunk_get_n_tokens(chunk);
+#         }
+#         return n;
+#     }
+# };
+mtmd_batch_p = NewType("mtmd_batch_p", int)
+mtmd_batch_p_ctypes = c_void_p
 
 # struct mtmd_input_text {
 #     const char * text;
@@ -210,6 +225,11 @@ class clip_context_params(Structure):
 #     // callback function passed over to mtmd proper
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;
+#
+#     // batching params
+#     int32_t batch_max_tokens; // maximum number of output tokens in a batch
+#                               // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
+#                               // (default: 1024)
 # };
 class mtmd_context_params(Structure):
     _fields_ = [
@@ -224,6 +244,7 @@ class mtmd_context_params(Structure):
         ("image_max_tokens", c_int),
         ("cb_eval", ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
+        ("batch_max_tokens", c_int32),
     ]
 
 mtmd_context_params_p_ctypes = POINTER(mtmd_context_params)
@@ -731,8 +752,8 @@ def mtmd_tokenize(
 
 # // returns 0 on success
 # // TODO: deprecate
-# MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
-#                              const mtmd_image_tokens * image_tokens);
+# DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens),
+#            "use mtmd_encode_chunk() instead");
 @ctypes_function_mtmd(
     "mtmd_encode", [
         mtmd_context_p_ctypes,
@@ -745,10 +766,15 @@ def mtmd_encode(
     image_tokens: mtmd_image_tokens_p,
     /,
 ) -> c_int32:
+    """
+    DEPRECATED: use mtmd_encode_chunk() instead
+    """
     ...
 
 
+# // text chunk will be ignored silently, only media chunk will be encoded
 # // returns 0 on success
+# // returns 1 on generic error
 # MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
 #                                    const mtmd_input_chunk * chunk);
 @ctypes_function_mtmd(
@@ -763,6 +789,11 @@ def mtmd_encode_chunk(
     chunk: mtmd_input_chunk_p,
     /,
 ) -> c_int32:
+    """
+    text chunk will be ignored silently, only media chunk will be encoded
+    returns 0 on success
+    returns 1 on generic error
+    """
     ...
 
 # // get output embeddings from the last encode pass
@@ -778,6 +809,95 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # type: ignor
     ...
 
 
+# // batch encoding API
+# // chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
+# // batch is valid for a given context, cannot be shared across contexts
+# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
+@ctypes_function_mtmd(
+    "mtmd_batch_init",
+    [mtmd_context_p_ctypes],
+    mtmd_batch_p_ctypes,
+)
+def mtmd_batch_init(ctx: mtmd_context_p, /) -> mtmd_batch_p:
+    ...
+
+
+# MTMD_API void         mtmd_batch_free(mtmd_batch * batch);
+@ctypes_function_mtmd(
+    "mtmd_batch_free",
+    [mtmd_batch_p_ctypes],
+    None,
+)
+def mtmd_batch_free(batch: mtmd_batch_p, /):
+    """
+    chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
+    batch is valid for a given context, cannot be shared across contexts
+    """
+    ...
+
+
+# // only media chunks are allowed, text chunks will be rejected
+# // returns 0 on success
+# // returns 1 on generic error
+# // returns 2 if the batch is too large (chunk won't be added)
+# // returns 3 if it cannot be batched with the existing chunks in the batch
+# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function_mtmd(
+    "mtmd_batch_add_chunk",
+    [
+        mtmd_batch_p_ctypes,
+        mtmd_input_chunk_p_ctypes,
+    ],
+    c_int32,
+)
+def mtmd_batch_add_chunk(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> c_int32:
+    """
+    only media chunks are allowed, text chunks will be rejected
+    returns 0 on success
+    returns 1 on generic error
+    returns 2 if the batch is too large (chunk won't be added)
+    returns 3 if it cannot be batched with the existing chunks in the batch
+    """
+    ...
+
+
+# // returns 0 on success
+# // returns 1 on generic error
+# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
+@ctypes_function_mtmd(
+    "mtmd_batch_encode",
+    [mtmd_batch_p_ctypes],
+    c_int32,
+)
+def mtmd_batch_encode(batch: mtmd_batch_p, /) -> c_int32:
+    """
+    returns 0 on success
+    returns 1 on generic error
+    """
+    ...
+
+
+# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function_mtmd(
+    "mtmd_batch_get_output_embd",
+    [
+        mtmd_batch_p_ctypes,
+        mtmd_input_chunk_p_ctypes,
+    ],
+    POINTER(c_float),
+)
+def mtmd_batch_get_output_embd(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> POINTER(c_float):  # type: ignore
+    ...
+
+
 # // Set callback for all future logging events.
 # // If this is not called, or NULL is supplied, everything is output on stderr.
 # MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
@@ -947,8 +1067,8 @@ def mtmd_helper_bitmap_init_from_buf(
 # // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
 @ctypes_function_mtmd(
-    "mtmd_helper_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t)
-def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t:
+    "mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t)
+def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p) -> c_size_t:
     """
     helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
     """
@@ -959,8 +1079,8 @@ def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t:
 # // normally, n_pos is equal to n_tokens, but for M-RoPE it is different
 # MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
 @ctypes_function_mtmd(
-    "mtmd_helper_get_n_pos", [mtmd_input_chunk_p_ctypes], c_int32)
-def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32:
+    "mtmd_helper_get_n_pos", [mtmd_input_chunks_p_ctypes], c_int32)
+def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p) -> c_int32:
     """
     helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
     normally, n_pos is equal to n_tokens, but for M-RoPE it is different
@@ -991,8 +1111,8 @@ def mtmd_helper_image_get_decoder_pos(
 
 # // helper function that automatically:
 # // 1. run llama_decode() on text chunks
-# // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-# // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+# // 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+# // if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
 # // otherwise, returns 0 on success
 # // this function is NOT thread-safe
 # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
@@ -1007,7 +1127,7 @@ def mtmd_helper_image_get_decoder_pos(
     "mtmd_helper_eval_chunks", [
         mtmd_context_p_ctypes,
         llama_cpp.llama_context_p_ctypes,
-        mtmd_input_chunk_p_ctypes,
+        mtmd_input_chunks_p_ctypes,
         c_int32,
         c_int32,
         c_int32,
@@ -1018,7 +1138,7 @@ def mtmd_helper_image_get_decoder_pos(
 def mtmd_helper_eval_chunks(
     ctx: mtmd_context_p,
     lctx: llama_cpp.llama_context_p,
-    chunks: mtmd_input_chunk_p,
+    chunks: mtmd_input_chunks_p,
     n_past: c_int32,
     seq_id: c_int32,
     n_batch: c_int32,
@@ -1106,7 +1226,7 @@ def mtmd_helper_decode_image_chunk(
     n_past: c_int32,
     seq_id: c_int32,
     n_batch: c_int32,
-    new_n_past: c_int32,
+    new_n_past: POINTER(c_int32),   # type: ignore
     /,
 ) -> c_int32:
     """

From cb299e67e51e5aff061ebcf9f1521695ad3f1a5d Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 03:05:32 +0800
Subject: [PATCH 20/36] Update(MTMDChatHandler): add chunk type helpers

- Add small helper methods `_is_text_chunk`, `_is_image_chunk`, and
  `_is_audio_chunk` that check whether a chunk type is the MTMD text, image,
  or audio enum value.

- This makes MTMD prompt processing easier to read and avoids repeating direct
  enum comparisons when building token spans for text and media chunks.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_chat_format.py | 36 +++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 520d2429d4..aadec4600e 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3341,6 +3341,26 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes):
 
         return wrapper.bitmap, wrapper.video_ctx
 
+    def _is_text_chunk(self, chunk_type: int) -> bool:
+        """Return True if `chunk_type` is the MTMD text chunk type enum value."""
+        return (
+            chunk_type
+            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT
+        )
+
+    def _is_image_chunk(self, chunk_type: int) -> bool:
+        """Return True if `chunk_type` is the MTMD image chunk type enum value."""
+        return (
+            chunk_type
+            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE
+        )
+
+    def _is_audio_chunk(self, chunk_type: int) -> bool:
+        """Return True if `chunk_type` is the MTMD audio chunk type enum value."""
+        return (
+            chunk_type
+            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
+        )
 
     def _process_mtmd_prompt(
         self,
@@ -3480,7 +3500,7 @@ def _create_bitmap_func(idx: int, item: dict):
                 if chunk is None: continue
                 chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
 
-                if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT:
+                if self._is_text_chunk(chunk_type):
                     # Extract standard text token IDs
                     n_tokens_out = ctypes.c_size_t()
                     tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out))
@@ -3489,10 +3509,7 @@ def _create_bitmap_func(idx: int, item: dict):
                         chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None))
                         full_prompt_ids.extend(tokens)
                         current_idx += len(tokens)
-                elif chunk_type in [
-                        self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                        self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
-                    ]:
+                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
                     # Extract media properties
                     # Note(JamePeng):
                     # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models).
@@ -3673,7 +3690,7 @@ def __call__(
                 if end_idx <= n_past:
                     continue
 
-                if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT:
+                if self._is_text_chunk(chunk_type):
                     unprocessed_start = max(start_idx, n_past) - start_idx
                     n_tokens_out = ctypes.c_size_t()
                     tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out))
@@ -3689,14 +3706,11 @@ def __call__(
                             llama.eval(tokens_to_eval)
                             n_past = llama.n_tokens
 
-                elif chunk_type in [
-                        self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                        self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
-                    ]:
+                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
                     chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr)
 
                     if self.verbose:
-                        media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO"
+                        media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO"
                         print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
 
                     # Stage 5: Multimodal Physical OOM Defense

From d8ee3eed7163c6c1f3802a9b979f9009e5e96c53 Mon Sep 17 00:00:00 2001
From: Alcoft <alcofttao@protonmail.com>
Date: Sun, 14 Jun 2026 08:00:09 +0200
Subject: [PATCH 21/36] Rename 'clip_model_path' to 'mmproj_path'. Implement
 'chat_template_override'. Pass only the chat template from Llama to the chat
 handler, not the entire model metadata.

---
 llama_cpp/llama.py             | 10 ++++-----
 llama_cpp/llama_chat_format.py | 39 +++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 544e755ea9..1f5ffa20b5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -96,7 +96,7 @@ class Llama:
     def __init__(
         self,
         model_path: str,
-        clip_model_path: Optional[str] = None,
+        mmproj_path: Optional[str] = None,
         *,
         # Model Params
         n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto",
@@ -710,13 +710,13 @@ def __init__(
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
         
-        if clip_model_path is not None:
+        if mmproj_path is not None:
             if self.chat_handler is not None and self.verbose:
-                print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True)
+                print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True)
 
             self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
-                gguf_metadata = self.metadata,
-                clip_model_path = clip_model_path,
+                chat_format = self.metadata.get("tokenizer.chat_template", None),
+                mmproj_path = mmproj_path,
                 verbose = self.verbose,
                 **chat_handler_kwargs
             )
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 254195f95a..966c2e28fa 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2856,11 +2856,12 @@ class MTMDChatHandler:
 
     def __init__(
             self,
-            clip_model_path: str,
+            mmproj_path: str,
             verbose: bool = True,
             use_gpu: bool = True,
             image_min_tokens: int = -1,
             image_max_tokens: int = -1,
+            chat_template_override: Optional[str] = None,
             **kwargs
     ):
 
@@ -2872,7 +2873,7 @@ def __init__(
                 f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}."
             )
 
-        self.clip_model_path = clip_model_path
+        self.mmproj_path = mmproj_path
         self.image_min_tokens = image_min_tokens
         self.image_max_tokens = image_max_tokens
         self.use_gpu = use_gpu
@@ -2883,20 +2884,25 @@ def __init__(
         self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
         self.extra_template_arguments: dict[str, Any] = {}
 
-        if not os.path.exists(clip_model_path):
-            raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")
+        if not os.path.exists(mmproj_path):
+            raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}")
 
         # Pre-compile Jinja template
-        if not hasattr(self, "chat_format") or self.chat_format is None:
+        if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None:
             self.chat_format = self.CHAT_FORMAT
+        elif chat_template_override is not None:
+            self.chat_format = chat_template_override
 
         self._chat_format_parser_tags = []
-        self.chat_template = ImmutableSandboxedEnvironment(
-            trim_blocks=True,
-            lstrip_blocks=True,
-        ).from_string(self.chat_format)
+        self.change_chat_template(self.chat_format)
 
         self._exit_stack = ExitStack()
+    
+    def change_chat_template(self, new_template: str):
+        self.chat_template = ImmutableSandboxedEnvironment(
+            trim_blocks=True,
+            lstrip_blocks=True
+        ).from_string(new_template)
 
     def _init_mtmd_context(self, llama_model: llama_core.Llama):
         """Initialize mtmd context with the llama model."""
@@ -2929,13 +2935,13 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama):
 
         # Initialize mtmd context
         self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
-            self.clip_model_path.encode(),
+            self.mmproj_path.encode(),
             llama_model.model,
             self.mctx_params
         )
 
         if self.mtmd_ctx is None:
-            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}")
+            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}")
 
         # Check if vision is supported
         self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx)
@@ -3835,7 +3841,7 @@ def from_pretrained(
             model_path = os.path.join(local_dir, filename)
 
         return cls(
-            clip_model_path=model_path,
+            mmproj_path=model_path,
             **kwargs,
         )
 
@@ -3852,13 +3858,12 @@ class GenericMTMDChatHandler(MTMDChatHandler):
 
     def __init__(
         self,
-        gguf_metadata: Dict[str, Any],
-        clip_model_path: str,
+        chat_format: str,
+        mmproj_path: str,
         verbose: bool = True,
         **kwargs
     ) -> None:
-        self.model_metadata = gguf_metadata
-        self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
+        self.chat_format = chat_format
 
         if verbose:
             print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
@@ -3866,7 +3871,7 @@ def __init__(
         if self.chat_format is None:
             raise ValueError("Failed to get model chat template automatically.")
         
-        super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
+        super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs)
     
     def __call__(self, **kwargs):
         self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]

From 1965d5f6c3c949cab7f7ef934266c8062ebc0f45 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 20:19:43 +0800
Subject: [PATCH 22/36] refactor(mtmd): move multimodal handlers to separate
 module `llama_multimodal`

- Move MTMDChatHandler, GenericMTMDChatHandler, and model-specific multimodal
chat handlers out of llama_chat_format.py into llama_multimodal.py.

- llama_chat_format.py has grown too large and difficult to maintain, especially
as MTMD support expands beyond image-only use cases. Splitting multimodal
handling into its own module makes the chat formatting layer smaller and keeps
media loading, MTMD tokenization, multimodal KV-cache bookkeeping, and handler
implementations in a dedicated place.

- This also prepares the codebase for broader multimodal support and future video
frame / image batch evaluation, where the media-processing path will need to
evolve independently from text-only chat formatting.

- Keep backward-compatible re-exports from llama_chat_format.py so existing
imports continue to work.

- Also keep `clip_model_path` as a deprecated initialization alias for
`mmproj_path` in the base MTMD handler.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama.py             |    8 +-
 llama_cpp/llama_chat_format.py | 3811 ++------------------------------
 llama_cpp/llama_multimodal.py  | 3473 +++++++++++++++++++++++++++++
 3 files changed, 3690 insertions(+), 3602 deletions(-)
 create mode 100644 llama_cpp/llama_multimodal.py

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ec202568f1..dbc60eaf76 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -45,6 +45,7 @@
 from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer
 import llama_cpp.llama_cpp as llama_cpp_lib
 import llama_cpp.llama_chat_format as llama_chat_format
+import llama_cpp.llama_multimodal as llama_multimodal
 
 from llama_cpp.llama_speculative import LlamaDraftModel
 
@@ -711,20 +712,19 @@ def __init__(
             self.metadata = {}
             if self.verbose:
                 print(f"Failed to load metadata: {e}", file=sys.stderr)
-
-        if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
         
         if mmproj_path is not None:
             if self.chat_handler is not None and self.verbose:
                 print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True)
 
-            self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
+            self.chat_handler = llama_multimodal.GenericMTMDChatHandler(
                 chat_format = self.metadata.get("tokenizer.chat_template", None),
                 mmproj_path = mmproj_path,
                 verbose = self.verbose,
                 **chat_handler_kwargs
             )
+
+        if self.verbose:
             print(f"Model desc: {self.model_desc}, "
                   f"Model size: {self.model_size / (1024 * 1024):.2f} MB, "
                   f"Model metadata: {self.metadata}",
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 0e5c9d4906..6ffe68e5e3 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import base64
-import ctypes
 import dataclasses
 import datetime
 import json
@@ -9,9 +7,7 @@
 import random
 import string
 import sys
-import zlib
 
-from contextlib import ExitStack
 from typing import (
     Any,
     Dict,
@@ -32,16 +28,11 @@
 import numpy as np
 import numpy.typing as npt
 
-import urllib.request
-from urllib.error import URLError, HTTPError
-
-import llama_cpp.llama_cpp as llama_cpp_lib
 import llama_cpp.llama as llama_core
 import llama_cpp.llama_types as llama_types
 import llama_cpp.llama_grammar as llama_grammar
 
-from ._ggml import GGMLLogLevel
-from ._logger import logger, ggml_log_callback
+from ._logger import logger
 from ._utils import suppress_stdout_stderr, Singleton
 
 ### Common Chat Templates and Special Tokens ###
@@ -3037,3612 +3028,204 @@ def generate_streaming(tools, functions, function_call, prompt):
         )
 
 
-class MTMDChatHandler:
-    DEFAULT_SYSTEM_MESSAGE: Optional[str] = (
-"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, "
-"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful."
-    )
-
-    CHAT_FORMAT = (
-        "{{ bos_token if bos_token is defined else '' }}"
+@register_chat_completion_handler("chatml-function-calling")
+def chatml_function_calling(
+    llama: llama_core.Llama,
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    min_p: float = 0.05,
+    typical_p: float = 1.0,
+    stream: bool = False,
+    stop: Optional[Union[str, List[str]]] = [],
+    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+    max_tokens: Optional[int] = None,
+    present_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+    repeat_penalty: float = 1.1,
+    top_n_sigma: float = -1.00,
+    mirostat_mode: int = 0,
+    mirostat_tau: float = 5.0,
+    mirostat_eta: float = 0.1,
+    xtc_threshold: float = 0.1,
+    xtc_probability: float = 0.0,
+    dry_multiplier: float = 0.0,
+    dry_base: float = 1.75,
+    dry_allowed_length: int = 2,
+    dry_penalty_last_n:int = 0,
+    dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"],
+    adaptive_target : float = -1.0,
+    adaptive_decay : float = 0.9,
+    use_infill: bool = False,
+    model: Optional[str] = None,
+    logits_processor: Optional[llama_core.LogitsProcessorList] = None,
+    grammar: Optional[llama_grammar.LlamaGrammar] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
+    **kwargs,  # type: ignore
+) -> Union[
+    llama_types.CreateChatCompletionResponse,
+    Iterator[llama_types.CreateChatCompletionStreamResponse],
+]:
+    function_calling_template = (
         "{% for message in messages %}"
-            "{% if message.role == 'system' %}"
-                "{{ message.content }}"
-            "{% elif message.role == 'user' %}"
-                "USER: "
-                "{% if message.content is string %}"
-                    "{{ message.content }}"
-                "{% elif message.content is iterable %}"
-                    "{% for content in message.content %}"
-                        "{% if content.type == 'image_url' %}"
-                            "{{ content.image_url if content.image_url is string else content.image_url.url }}"
-                        "{% elif content.type == 'audio_url' %}"
-                            "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}"
-                        "{% elif content.type == 'input_audio' %}"
-                            "{% if content.input_audio is string %}"
-                                "{{ content.input_audio }}"
-                            "{% else %}"
-                                "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}"
-                            "{% endif %}"
-                        "{% elif content.type == 'video_url' %}"
-                            "{{ content.video_url if content.video_url is string else content.video_url.url }}"
-                        "{% elif content.type == 'text' %}"
-                            "{{ content.text }}"
-                        "{% endif %}"
-                    "{% endfor %}"
-                "{% endif %}"
-
-            "{% elif message.role == 'assistant' and message.content is not none %}"
-                "ASSISTANT: {{ message.content }}"
-            "{% endif %}"
-            "{{ \"\n\" }}"
+        "<|im_start|>{{ message.role }}\n"
+        # System message
+        "{% if message.role == 'system' %}"
+        "{{ message.content }}"
+        "{% if tool_calls %}"
+        "\n\nYou have access to the following functions:\n"
+        "{% for tool in tools %}"
+        "\nfunctions.{{ tool.function.name }}:\n"
+        "{{ tool.function.parameters | tojson }}"
+        "\n{% endfor %}"
+        "\n\nYou can respond to users messages with either a single message or one or more function calls."
+        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
+        "\n\nmessage:"
+        "\n<message>"
+        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
+        "\n\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" }'
+        "\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" }'
+        "{% endif %}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "{{ message.content }}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        ## Regular message
+        "{% if message.content and message.content | length > 0 %}"
+        "{% if tool_calls %}"
+        "message:\n"
+        "{% endif %}"
+        "{{ message.content }}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        ## Function calls
+        "{% if 'tool_calls' in message %}"
+        "{% for tool_call in message.tool_calls %}"
+        "functions.{{ tool_call.function.name }}:\n"
+        "{{ tool_call.function.arguments }}"
         "{% endfor %}"
-
-        "{% if eos_token is defined %}"
-            "{{ eos_token }}"
+        "<|im_end|>\n"
         "{% endif %}"
-
-        "{% if add_generation_prompt %}"
-            "ASSISTANT: "
         "{% endif %}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
     )
+    template_renderer = ImmutableSandboxedEnvironment(
+        autoescape=jinja2.select_autoescape(["html", "xml"]),
+        undefined=jinja2.StrictUndefined,
+    ).from_string(function_calling_template)
 
-    def __init__(
-            self,
-            mmproj_path: str,
-            verbose: bool = True,
-            use_gpu: bool = True,
-            image_min_tokens: int = -1,
-            image_max_tokens: int = -1,
-            chat_template_override: Optional[str] = None,
-            batch_max_tokens: int = 1024,
-            **kwargs
-    ):
-
-        self.log_prefix = self.__class__.__name__
-        if kwargs:
-            unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys())
-            raise TypeError(
-                f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n"
-                f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}."
-            )
-
-        self.mmproj_path = mmproj_path
-        self.image_min_tokens = image_min_tokens
-        self.image_max_tokens = image_max_tokens
-        self.batch_max_tokens = batch_max_tokens
-        self.use_gpu = use_gpu
-        self.verbose = verbose
-
-        import llama_cpp.mtmd_cpp as mtmd_cpp
-        self._mtmd_cpp = mtmd_cpp
-        self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
-        self.extra_template_arguments: dict[str, Any] = {}
-
-        self.is_support_vision = False
-        self.is_support_audio = False
-        self.is_support_video = False
-
-        if not os.path.exists(mmproj_path):
-            raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}")
-
-        # Pre-compile Jinja template
-        if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None:
-            self.chat_format = self.CHAT_FORMAT
-        elif chat_template_override is not None:
-            self.chat_format = chat_template_override
-
-        self._chat_format_parser_tags = []
-        self.change_chat_template(self.chat_format)
-
-        self._exit_stack = ExitStack()
-    
-    def change_chat_template(self, new_template: str):
-        self.chat_template = ImmutableSandboxedEnvironment(
-            trim_blocks=True,
-            lstrip_blocks=True
-        ).from_string(new_template)
-
-    def _init_mtmd_context(self, llama_model: llama_core.Llama):
-        """Initialize mtmd context with the llama model."""
-        if self.mtmd_ctx is not None:
-            return  # Already initialized
-
-        self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0))
-
-        # Get default parameters
-        self.mctx_params = self._mtmd_cpp.mtmd_context_params_default()
-        self.mctx_params.use_gpu = self.use_gpu
-        self.mctx_params.print_timings = self.verbose
-        self.mctx_params.n_threads = llama_model.n_threads
-        self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
-        self.mctx_params.warmup = True
-        if self.image_min_tokens > 0:
-            self.mctx_params.image_min_tokens = self.image_min_tokens
-        if self.image_max_tokens > 0:
-            self.mctx_params.image_max_tokens = self.image_max_tokens
-        if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0:
-            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) "
-                                f"cannot be less than image_min_tokens ({self.image_min_tokens}).")
-        self.mctx_params.batch_max_tokens = self.batch_max_tokens
-
-        # Cache the model's eos token and bos token
-        self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore')
-        self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore')
-
-        # Cache the mtmd_default_marker
-        self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
-
-        # Initialize mtmd context
-        self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
-            self.mmproj_path.encode(),
-            llama_model.model,
-            self.mctx_params
-        )
-
-        if self.mtmd_ctx is None:
-            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}")
-
-        # Check if vision is supported
-        self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx)
-        if self.is_support_vision:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr)
-        else:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr)
-
-        # Check if audio is supported
-        self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx)
-        if self.is_support_audio:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr)
-        else:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr)
-
-        # Check if video is supported
-        self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx)
-        if self.is_support_video:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr)
-        else:
-            if self.verbose:
-                print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr)
-
-    def close(self) -> None:
-        """Explicitly free the mtmd context and vision model resources."""
-        if getattr(self, "mtmd_ctx", None) is not None:
-            try:
-                self._mtmd_cpp.mtmd_free(self.mtmd_ctx)
-            except Exception:
-                pass
-            self.mtmd_ctx = None
-            self.mctx_params = None
-            self.chat_template = None
-
-        if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"):
-            self._exit_stack.close()
-            self._exit_stack = None
-
-    def __del__(self) -> None:
-        self.close()
-
-    def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]:
-        """
-        Extracts all media payloads (images, audio) sequentially to maintain exact chronological order.
-        Strictly enforces capability checks, raising exceptions if unsupported media is passed.
-
-        Returns:
-            media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio).
-        """
-        media_items: List[Dict[str, str]] = []
-        for message in messages:
-            if isinstance(message.get("content"), list):
-                for content in message["content"]:
-                    content_type = content.get("type", "")
-
-                    # 1. Vision Processing
-                    if content_type == "image_url":
-                        if not self.is_support_vision:
-                            raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.")
-
-                        url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"]
-                        media_items.append({"url": url, "type": "image"})
-
-                    # 2. Audio Processing
-                    elif content_type in ["audio", "audio_url", "input_audio"]:
-                        if not self.is_support_audio:
-                            raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")
-
-                        # Case A: Handle custom/forward-compatible audio_url format
-                        if content_type == "audio_url" or content_type == "audio":
-                            audio_url = content[content_type]
-                            url = audio_url if isinstance(audio_url, str) else audio_url["url"]
-                            media_items.append({"url": url, "type": "audio"})
-                        # Case B: Handle OpenAI standard input_audio format
-                        elif content_type == "input_audio":
-                            input_audio = content.get("input_audio", {})
-                            if isinstance(input_audio, dict) and "data" in input_audio:
-                                # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic
-                                # input_audio: {
-                                #     data: audio.base64Data,
-                                #     format: audio.mimeType.includes('wav') ? 'wav' : 'mp3'
-                                # }
-                                audio_data = input_audio.get("data", "")
-                                audio_format = input_audio.get("format", "")
-
-                                # Strictly align with llama.cpp (require wav/mp3)
-                                if audio_format not in ["wav", "mp3"]:
-                                    raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'")
-
-                                # Format as a Data URI to reuse the unified load_media logic
-                                media_items.append({
-                                    "url": f"data:audio/{audio_format};base64,{audio_data}",
-                                    "type": "audio"
-                                })
-                            else:
-                                # Just a raw base64 data
-                                url = input_audio if isinstance(input_audio, str) else ""
-                                if url:
-                                    media_items.append({"url": url, "type": "audio"})
-
-                    # 3. Video Processing
-                    elif content_type == "video_url":
-                        if not self.is_support_video:
-                            raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.")
-
-                        video_url = content["video_url"]
-                        url = video_url if isinstance(video_url, str) else video_url["url"]
-                        media_items.append({"url": url, "type": "video"})
-
-                    # 4. Text & Unknown Types
-                    elif content_type == "text":
-                        continue
-                    else:
-                        if self.verbose:
-                            print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr)
-        return media_items
-
-    def _create_bitmap_from_bytes(self, media_bytes: bytes):
-        """
-        Constructs an mtmd_bitmap structure from a raw byte buffer containing media data.
-
-        Supported formats:
-          - Images (via stb_image): jpg, png, bmp, etc.
-          - Audio (via miniaudio): wav, mp3, flac.
-          - Video: depends on whether MTMD_VIDEO was enabled at build time.
-
-        Note:
-          - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes.
-          - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing.
-
-        Args:
-            media_bytes (bytes): The raw byte content of the media file.
-
-        Returns:
-            bitmap: mtmd_bitmap *
-            video_ctx: mtmd_helper_video * or NULL
-        """
-        if self.mtmd_ctx is None:
-            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.")
+    # Convert legacy functions to tools
+    if functions is not None:
+        tools = [
+            {
+                "type": "function",
+                "function": function,
+            }
+            for function in functions
+        ]
 
-        if not media_bytes:
-            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.")
+    # Convert legacy function_call to tool_choice
+    if function_call is not None:
+        if isinstance(function_call, str) and (
+            function_call == "none" or function_call == "auto"
+        ):
+            tool_choice = function_call
+        if isinstance(function_call, dict) and "name" in function_call:
+            tool_choice = {
+                "type": "function",
+                "function": {
+                    "name": function_call["name"],
+                },
+            }
 
-        buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes)
+    stop = (
+        [stop, "<|im_end|>"]
+        if isinstance(stop, str)
+        else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
+    )
 
-        wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
-            self.mtmd_ctx,
-            buf,
-            len(media_bytes),
-            False,
+    # Case 1: No tool choice by user
+    if (
+        tool_choice is None
+        or (isinstance(tool_choice, str) and tool_choice == "none")
+        or tools is None
+        or len(tools) == 0
+    ):
+        prompt = template_renderer.render(
+            messages=messages,
+            tools=[],
+            tool_calls=None,
+            add_generation_prompt=True,
         )
 
-        if not wrapper.bitmap:
-            if wrapper.video_ctx:
-                self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx)
-
-            raise ValueError(
-                f"{self.log_prefix}(_create_bitmap_from_bytes): "
-                "Failed to load media from bytes "
-                "(unsupported media format, corrupted data, or missing helper support)."
-            )
-
-        return wrapper.bitmap, wrapper.video_ctx
-
-    def _is_text_chunk(self, chunk_type: int) -> bool:
-        """Return True if `chunk_type` is the MTMD text chunk type enum value."""
-        return (
-            chunk_type
-            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT
-        )
+        if response_format is not None and response_format["type"] == "json_object":
+            grammar = _grammar_for_response_format(response_format)
 
-    def _is_image_chunk(self, chunk_type: int) -> bool:
-        """Return True if `chunk_type` is the MTMD image chunk type enum value."""
-        return (
-            chunk_type
-            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE
+        return _convert_completion_to_chat(
+            llama.create_completion(
+                prompt=prompt,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                typical_p=typical_p,
+                stream=stream,
+                stop=stop,
+                max_tokens=max_tokens,
+                present_penalty=present_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                top_n_sigma=top_n_sigma,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                xtc_threshold=xtc_threshold,
+                xtc_probability=xtc_probability,
+                dry_multiplier=dry_multiplier,
+                dry_base=dry_base,
+                dry_allowed_length=dry_allowed_length,
+                dry_penalty_last_n=dry_penalty_last_n,
+                dry_seq_breakers=dry_seq_breakers,
+                adaptive_target=adaptive_target,
+                adaptive_decay=adaptive_decay,
+                use_infill=use_infill,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=grammar,
+                logprobs=top_logprobs if logprobs else None,
+            ),
+            stream=stream,
         )
 
-    def _is_audio_chunk(self, chunk_type: int) -> bool:
-        """Return True if `chunk_type` is the MTMD audio chunk type enum value."""
-        return (
-            chunk_type
-            == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
+    # Case 2: Tool choice by user
+    if isinstance(tool_choice, dict):
+        tool_name = tool_choice["function"]["name"]
+        tool = next(
+            (tool for tool in tools if tool["function"]["name"] == tool_name), None
         )
-
-    def _process_mtmd_prompt(
-        self,
-        llama: llama_core.Llama,
-        messages: List[llama_types.ChatCompletionRequestMessage],
-        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
-        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
-        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
-        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
-        add_generation_prompt: bool = True,
-    ) -> Tuple[List[int], List[tuple], Any, List[Any]]:
-        """
-        Core multimodal preprocessing pipeline.
-        Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger.
-
-        Features:
-        - Thread-safe concurrent media decoding to eliminate I/O bottlenecks.
-        - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens.
-        - Strict RAII-style C++ memory management to prevent leaks on failure.
-
-        Returns:
-            full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching.
-            chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id).
-            chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller).
-            bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation.
-        """
-        # 1. Inject default system prompt if omitted by the user
-        system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "")
-        if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
-            messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages
-
-        media_items = self._get_media_items(messages)
-        media_marker = self.media_marker
-
-        # 2. Render the chat template and replace actual URLs with C++ media markers
-        text = self.chat_template.render(
+        if tool is None:
+            raise ValueError(f"Tool with name '{tool_name}' not found in tools")
+        prompt = template_renderer.render(
             messages=messages,
-            add_generation_prompt=add_generation_prompt,
-            eos_token=self.mtmd_eos_token,
-            bos_token=self.mtmd_bos_token,
-            functions=functions,
-            function_call=function_call,
             tools=tools,
-            tool_choice=tool_choice,
-            **getattr(self, 'extra_template_arguments', {})
+            tool_calls=True,
+            add_generation_prompt=True,
         )
-        
-        for tag in self._chat_format_parser_tags:
-            if tag not in text:
-                continue
-
-            text = text.replace(tag, media_marker)
-
-        # Replace image_url by media_marker in text
-        for item in media_items:
-            text = text.replace(item["url"], media_marker)
-
-        if self.verbose:
-            print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr)
-            print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr)
-
-        # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding
-        bitmaps = [None] * len(media_items)
-        bitmap_cleanup = []
-        video_cleanup = []
-        chunks = None
-
-        try:
-            # Concurrent Media Decoding
-            import concurrent.futures
-            if media_items:
-                def _create_bitmap_func(idx: int, item: dict):
-                    media_bytes = self.load_media(item["url"], item["type"])
-                    bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes)
-                    return idx, bitmap, video_ctx
-                # This method uses multi-threaded parallel processing to convert images or audio to bitmaps,
-                # which can be used in the future to process large numbers of video frames.
-                max_workers = min(llama.n_threads, len(media_items))
-                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)]
-
-                    for future in concurrent.futures.as_completed(futures):
-                        idx, bitmap, video_ctx = future.result()
-
-                        bitmaps[idx] = bitmap
-                        bitmap_cleanup.append(bitmap)
-
-                        if video_ctx:
-                            video_cleanup.append(video_ctx)
-
-                # Strict validation: Abort if any thread failed to decode its assigned media
-                if any(b is None for b in bitmaps):
-                    raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.")
-                else:
-                    if self.verbose:
-                        print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.")
-            else:
-                # If there are no images, set the bitmaps to empty.
-                bitmaps = []
-
-            # 4. Initialize mtmd_input_chunks
-            input_text = self._mtmd_cpp.mtmd_input_text()
-            input_text.text = text.encode('utf-8')
-            input_text.add_special = (llama.n_tokens == 0)
-            input_text.parse_special = True
-
-            chunks = self._mtmd_cpp.mtmd_input_chunks_init()
-            if chunks is None:
-                raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.")
-
-            # 5. Hybrid Tokenization (Text + Media binding)
-            if len(bitmaps) > 0:
-                bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
-                result = self._mtmd_cpp.mtmd_tokenize(
-                    self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps)
-                )
-            else:
-                result = self._mtmd_cpp.mtmd_tokenize(
-                    self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0
-                )
-
-            if result != 0:
-                raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.")
-
-            # Video helper contexts only need to stay alive until mtmd_tokenize() completes.
-            if video_cleanup:
-                for video_ctx in video_cleanup:
-                    self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
-                video_cleanup.clear()
-
-            # 6. Virtual Token Ledger Construction
-            full_prompt_ids = []
-            chunk_token_spans = []
-            current_idx = 0
-            n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
-
-            # Cursor to track the actual media contents (URLs or base64 data) provided by the user
-            media_items_count = len(media_items)
-            media_items_cur = 0
-            last_media_id = None
-
-            for i in range(n_chunks):
-                chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
-                if chunk is None: continue
-                chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
-
-                if self._is_text_chunk(chunk_type):
-                    # Extract standard text token IDs
-                    n_tokens_out = ctypes.c_size_t()
-                    tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out))
-                    if tokens_ptr and n_tokens_out.value > 0:
-                        tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-                        chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None))
-                        full_prompt_ids.extend(tokens)
-                        current_idx += len(tokens)
-                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
-                    # Extract media properties
-                    # Note(JamePeng):
-                    # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models).
-                    # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample.
-                    # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise
-                    chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
-
-                    if media_items_cur < media_items_count:
-                        # The C++ parser only sees identical placeholders (e.g., "<__media__>").
-                        # We MUST inject the actual media content's identity here.
-                        real_media_url = media_items[media_items_cur]["url"]
-                        # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5)
-                        # Generate a deterministic, unique negative ID for this specific image/audio.
-                        # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()).
-                        # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with
-                        #   positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k).
-                        # This empowers `longest_token_prefix` to correctly identify and reuse cached images,
-                        # while instantly breaking the match if the image content changes.
-                        # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100
-                        media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100
-                        last_media_id = media_id
-                        media_items_cur += 1
-                    elif last_media_id is not None:
-                        # video may expand into multiple image chunks from one media marker
-                        media_id = last_media_id
-                    else:
-                        # Magic Negative Number as fallback :)
-                        media_id = -314159
-
-                    if self.verbose:
-                        print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ")
-
-                    chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))
-
-                    # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache
-                    full_prompt_ids.extend([media_id] * chunk_n_tokens)
-                    current_idx += chunk_n_tokens
-                else:
-                    raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.")
-
-            return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup
-
-        except Exception as e:
-            # Ensure no useless pointers remain upon any failure
-            # Free chunks
-            if chunks is not None:
-                self._mtmd_cpp.mtmd_input_chunks_free(chunks)
-                chunks = None
-            # Free bitmaps
-            if len(bitmap_cleanup) > 0:
-                for bitmap in bitmap_cleanup:
-                    self._mtmd_cpp.mtmd_bitmap_free(bitmap)
-                bitmap_cleanup = None
-            # Free videos
-            if len(video_cleanup) > 0:
-                for video_ctx in video_cleanup:
-                    self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
-                video_cleanup = None
-
-            bitmaps = None
-
-            raise e
-
-    def __call__(
-        self,
-        *,
-        llama: llama_core.Llama,
-        messages: List[llama_types.ChatCompletionRequestMessage],
-        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
-        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
-        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
-        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
-        temperature: float = 0.2,
-        top_p: float = 0.95,
-        top_k: int = 40,
-        min_p: float = 0.05,
-        typical_p: float = 1.0,
-        stream: bool = False,
-        stop: Optional[Union[str, List[str]]] = [],
-        seed: Optional[int] = None,
-        response_format: Optional[
-            llama_types.ChatCompletionRequestResponseFormat
-        ] = None,
-        max_tokens: Optional[int] = None,
-        present_penalty: float = 0.0,
-        frequency_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
-        top_n_sigma: float = -1.00,
-        mirostat_mode: int = 0,
-        mirostat_tau: float = 5.0,
-        mirostat_eta: float = 0.1,
-        xtc_threshold: float = 0.1,
-        xtc_probability: float = 0.0,
-        dry_multiplier: float = 0.0,
-        dry_base: float = 1.75,
-        dry_allowed_length: int = 2,
-        dry_penalty_last_n:int = 0,
-        dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"],
-        adaptive_target : float = -1.0,
-        adaptive_decay : float = 0.9,
-        use_infill: bool = False,
-        model: Optional[str] = None,
-        logits_processor: Optional[llama_core.LogitsProcessorList] = None,
-        grammar: Optional[llama_grammar.LlamaGrammar] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        top_logprobs: Optional[int] = None,
-        add_generation_prompt: bool = True,
-        reasoning_budget: int = -1,
-        reasoning_start: str = "<think>",
-        reasoning_end: str = "</think>",
-        reasoning_budget_message: Optional[str] = None,
-        reasoning_start_in_prompt: bool = False,
-        reasoning_start_max_tokens: Optional[int] = 32,
-        **kwargs,  # type: ignore
-    ) -> Union[
-        llama_types.CreateChatCompletionResponse,
-        Iterator[llama_types.CreateChatCompletionStreamResponse],
-    ]:
-        # 1. Initialize mtmd context
-        self._init_mtmd_context(llama)
-        assert self.mtmd_ctx is not None
-
-        # 2. Concurrent Preprocessing & Ledger Construction
-        full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt(
-            llama=llama,
-            messages=messages,
-            functions=functions,
-            function_call=function_call,
-            tools=tools,
-            tool_choice=tool_choice,
-            add_generation_prompt=add_generation_prompt,
-        )
-
-        if self.verbose:
-            print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr)
-
-        try:
-            # 3. KV Cache Synchronization & State Rollback
-            # Compares the virtual ledger with physical history to prevent Cache Poisoning.
-            current_history = llama.input_ids[:llama.n_tokens].tolist()
-            longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose)
-
-            if longest_prefix < llama.n_tokens:
-                if llama.is_hybrid and llama._hybrid_cache_mgr is not None:
-                    if llama._hybrid_cache_mgr.max_checkpoints > 0:
-                        if self.verbose:
-                            print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). "
-                                f"Searching for nearest checkpoint...", file=sys.stderr)
-
-                        best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0)
-                        if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0):
-                            llama.n_tokens = best_ckpt.pos
-                            if self.verbose:
-                                print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr)
-                        else:
-                            if self.verbose:
-                                print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr)
-                            llama._hybrid_cache_mgr.clear()
-                            llama._ctx.memory_clear(True)
-                            llama.n_tokens = 0
-                    else:
-                        if self.verbose:
-                            print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr)
-                        llama._hybrid_cache_mgr.clear()
-                        llama._ctx.memory_clear(True)
-                        llama.n_tokens = 0
-                else:
-                    if self.verbose:
-                        print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr)
-                    llama._ctx.memory_seq_rm(0, longest_prefix, -1)
-                    llama.n_tokens = longest_prefix
-
-            n_past = llama.n_tokens
-
-            for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans:
-                # Skip previously matched chunks
-                if end_idx <= n_past:
-                    continue
-
-                if self._is_text_chunk(chunk_type):
-                    unprocessed_start = max(start_idx, n_past) - start_idx
-                    n_tokens_out = ctypes.c_size_t()
-                    tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out))
-
-                    if tokens_ptr and n_tokens_out.value > 0:
-                        all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-                        tokens_to_eval = all_tokens[unprocessed_start:]
-
-                        if tokens_to_eval:
-                            if self.verbose:
-                                print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
-                            # Text evaluation delegates shift and chunking to native llama.eval
-                            llama.eval(tokens_to_eval)
-                            n_past = llama.n_tokens
-
-                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
-                    chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr)
-
-                    if self.verbose:
-                        media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO"
-                        print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
-
-                    # Stage 5: Multimodal Physical OOM Defense
-                    if n_past + chunk_n_tokens > llama.n_ctx():
-                        if not llama._ctx.memory_can_shift():
-                            raise RuntimeError(
-                                f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend "
-                                f"(n_pos_per_embd > 1 or incompatible M-RoPE). "
-                                f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), "
-                                f"You MUST increase n_ctx to fit the dialogue."
-                            )
-                        else:
-                            # Safely discard oldest tokens while preserving system prompts
-                            n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch
-                            n_keep = min(llama.n_keep, n_past)
-                            n_discard = min(n_discard, n_past - n_keep)
-
-                            if n_discard <= 0:
-                                raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.")
-
-                            if self.verbose:
-                                print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr)
-
-                            # Execute physical memory shift
-                            llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard)
-                            llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard)
-
-                            # Shift python virtual array to match
-                            remaining_len = n_past - (n_keep + n_discard)
-                            if remaining_len > 0:
-                                llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past]
-
-                            n_past -= n_discard
-                            llama.n_tokens = n_past
-
-                    # Execute C++ Multimodal Black-box Extraction
-                    new_n_past = llama_cpp_lib.llama_pos(0)
-                    result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
-                        self.mtmd_ctx,
-                        llama._ctx.ctx,
-                        chunk_ptr,
-                        llama_cpp_lib.llama_pos(n_past),
-                        llama_cpp_lib.llama_seq_id(0),
-                        llama.n_batch,
-                        True, # logits_last = True, drastically saves computational overhead
-                        ctypes.byref(new_n_past)
-                    )
-
-                    if result != 0:
-                        raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.")
-
-                    # Update Ledger with "Negative Reverse Vocabulary" IDs
-                    llama.input_ids[n_past : new_n_past.value] = media_id
-                    n_past = new_n_past.value
-                    llama.n_tokens = n_past
-
-            # Extract the final, perfectly synchronized prompt sequence
-            prompt = llama.input_ids[: llama.n_tokens].tolist()
-
-            # End-of-Turn Checkpoint
-            # Anchors the state ONLY after the entire multi-modal turn is processed
-            if (
-                llama.is_hybrid
-                and llama._hybrid_cache_mgr is not None
-                and llama._hybrid_cache_mgr.max_checkpoints > 0
-            ):
-                if self.verbose:
-                    print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr)
-
-                llama._hybrid_cache_mgr.save_checkpoint(
-                    current_pos=llama.n_tokens,
-                    tokens=prompt,
-                    seq_id=0
-                )
-        finally:
-            # Cleanup chunks
-            if chunks is not None:
-                self._mtmd_cpp.mtmd_input_chunks_free(chunks)
-                chunks = None
-            # Cleanup bitmaps
-            if bitmap_cleanup:
-                for bitmap in bitmap_cleanup:
-                    self._mtmd_cpp.mtmd_bitmap_free(bitmap)
-                bitmap_cleanup.clear()
-            bitmap_array = None
-
-        # Handle response format and tools (same as before)
-        if response_format is not None and response_format["type"] == "json_object":
-            grammar = _grammar_for_response_format(response_format)
-
-        # Convert legacy functions to tools
-        if functions is not None:
-            tools = [
-                {
-                    "type": "function",
-                    "function": function,
-                }
-                for function in functions
-            ]
-
-        # Convert legacy function_call to tool_choice
-        if function_call is not None:
-            if isinstance(function_call, str) and (
-                function_call == "none" or function_call == "auto"
-            ):
-                tool_choice = function_call
-            if isinstance(function_call, dict) and "name" in function_call:
-                tool_choice = {
-                    "type": "function",
-                    "function": {
-                        "name": function_call["name"],
-                    },
-                }
-
-        tool = None
-        if (
-            tool_choice is not None
-            and isinstance(tool_choice, dict)
-            and tools is not None
-        ):
-            name = tool_choice["function"]["name"]
-            tool = next((t for t in tools if t["function"]["name"] == name), None)
-            if tool is None:
-                raise ValueError(f"Tool choice '{name}' not found in tools.")
-            schema = tool["function"]["parameters"]
-            try:
-                # create grammar from json schema
-                grammar = llama_grammar.LlamaGrammar.from_json_schema(
-                    json.dumps(schema), verbose=llama.verbose
-                )
-            except Exception as e:
-                if llama.verbose:
-                    print(str(e), file=sys.stderr)
-                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.JSON_GBNF, verbose=llama.verbose
-                )
-
-        completion_or_chunks = llama.create_completion(
-            prompt=prompt,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            min_p=min_p,
-            typical_p=typical_p,
-            logprobs=top_logprobs if logprobs else None,
-            stream=stream,
-            stop=stop,
-            seed=seed,
-            max_tokens=max_tokens,
-            present_penalty=present_penalty,
-            frequency_penalty=frequency_penalty,
-            repeat_penalty=repeat_penalty,
-            top_n_sigma=top_n_sigma,
-            mirostat_mode=mirostat_mode,
-            mirostat_tau=mirostat_tau,
-            mirostat_eta=mirostat_eta,
-            xtc_threshold=xtc_threshold,
-            xtc_probability=xtc_probability,
-            dry_multiplier=dry_multiplier,
-            dry_base=dry_base,
-            dry_allowed_length=dry_allowed_length,
-            dry_penalty_last_n=dry_penalty_last_n,
-            dry_seq_breakers=dry_seq_breakers,
-            adaptive_target=adaptive_target,
-            adaptive_decay=adaptive_decay,
-            use_infill=use_infill,
-            model=model,
-            logits_processor=logits_processor,
-            grammar=grammar,
-            logit_bias=logit_bias,
-            reasoning_budget=reasoning_budget,
-            reasoning_start=reasoning_start,
-            reasoning_end=reasoning_end,
-            reasoning_budget_message=reasoning_budget_message,
-            reasoning_start_in_prompt=reasoning_start_in_prompt,
-            reasoning_start_max_tokens=reasoning_start_max_tokens,
-        )
-
-        if tool is not None:
-            tool_name = tool["function"]["name"]
-            return _convert_completion_to_chat_function(
-                tool_name, completion_or_chunks, stream
-            )
-        return _convert_completion_to_chat(completion_or_chunks, stream=stream)
-
-    def load_media(self, media_url: str, media_type: str) -> bytes:
-        """
-        Unified dispatcher for loading media payloads.
-        Routes the URL/URI to the specific image, audio, or video processor based on the media_type.
-        """
-        if media_type == "image":
-            return self._load_image(media_url)
-
-        elif media_type == "audio":
-            audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio")
-            try:
-                self.detect_audio_format(audio_bytes)
-            except ValueError as e:
-                raise ValueError(f"{self.log_prefix}(load_media): {e}")
-            return audio_bytes
-
-        elif media_type == "video":
-            return self._load_bytes(media_url, timeout=30, kind="video")
-
-        else:
-            raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'")
-
-    @staticmethod
-    def detect_audio_format(audio_bytes: bytes) -> str:
-        """
-        Pure utility function: Detects the audio format from magic bytes.
-        Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility
-        and avoid false positives (e.g., AVI files disguised as RIFF).
-        """
-        length = len(audio_bytes)
-
-        if length < 12:
-            raise ValueError("Audio data is corrupted or too small (less than 12 bytes).")
-
-        # RIFF & WAVE magic bytes verification
-        is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE"
-
-        # ID3 metadata or MPEG sync word verification
-        is_mp3 = length >= 3 and (
-            audio_bytes.startswith(b"ID3") or
-            (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0)
-        )
-
-        # FLAC magic bytes verification
-        is_flac = audio_bytes.startswith(b"fLaC")
-
-        if is_wav:
-            return "wav"
-        elif is_mp3:
-            return "mp3"
-        elif is_flac:
-            return "flac"
-        else:
-            raise ValueError(
-                "Unsupported audio format detected via magic bytes. "
-                "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
-            )
-
-    DEFAULT_HTTP_HEADERS = {
-        "User-Agent": (
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/148.0.0.0 Safari/537.36"
-        ),
-    }
-
-    @staticmethod
-    def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes:
-        """
-        Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL.
-        """
-        media_bytes = b""
-
-        # 1. Handle data URI
-        if media_url.strip().startswith("data:"):
-            comma_pos = media_url.find(",")
-            if comma_pos == -1:
-                raise ValueError("Invalid data URI: missing comma separator")
-
-            base64_data = media_url[comma_pos + 1:]
-            media_bytes = base64.b64decode(base64_data)
-
-        # 2. Handle local file path
-        elif os.path.exists(media_url):
-            with open(media_url, "rb") as f:
-                media_bytes = f.read()
-
-        # 3. Handle remote URL via HTTP/HTTPS
-        else:
-            req = urllib.request.Request(
-                media_url,
-                headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS,
-            )
-            try:
-                with urllib.request.urlopen(req, timeout=timeout) as f:
-                    media_bytes = f.read()
-            except (URLError, HTTPError) as e:
-                raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}")
-
-        if not media_bytes:
-            raise ValueError(f"Empty {kind} data received")
-
-        return media_bytes
-
-    @staticmethod
-    def _load_image(image_url: str) -> bytes:
-        """
-        Load an image from either a URL or a data URI and return it as JPEG bytes.
-
-        Supports:
-        - Remote images via HTTP/HTTPS (with proper User-Agent)
-        - Data URIs (base64-encoded, e.g., data:image/png;base64,...)
-        - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background
-        - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
-
-        Returns:
-            JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models.
-        """
-        # 1. Load image bytes from image_url
-        image_bytes = MTMDChatHandler._load_bytes(
-            image_url,
-            timeout=15,
-            kind="image",
-        )
-
-        # 2. Check if image_bytes is empty.
-        if not image_bytes:
-            raise ValueError("Empty image data received")
-
-        # 3. Open image with Pillow
-        try:
-            from PIL import Image, ImageStat
-        except ImportError:
-            raise ImportError("Pillow is required for image processing. Install with: pip install pillow")
-
-        import io
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # 4. Handle transparency (RGBA, LA, P with transparency, etc.)
-        if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info):
-            # Use alpha channel as mask
-            if image.mode == "P":
-                image = image.convert("RGBA")
-
-            alpha = image.split()[-1]  # Last channel is alpha
-            # Compute average brightness of visible (non-transparent) pixels
-            stat = ImageStat.Stat(image.convert("L"), mask=alpha)
-
-            # Choose background: white for dark content, black for bright content
-            bg_color = (255, 255, 255)  # white
-            if stat.count[0] > 0 and stat.mean[0] > 127:
-                bg_color = (0, 0, 0)  # black
-
-            background = Image.new("RGB", image.size, bg_color)
-            background.paste(image, mask=alpha)
-            image = background
-
-        # 5. Ensure RGB mode for formats like CMYK, palette, etc.
-        elif image.mode != "RGB":
-            image = image.convert("RGB")
-
-        # 6. Save as high-quality JPEG, suitable for most vision models.
-        output = io.BytesIO()
-        image.save(output, format="JPEG", quality=95, optimize=True, progressive=True)
-        return output.getvalue()
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        repo_id: str,
-        filename: Optional[str],
-        local_dir: Optional[Union[str, os.PathLike[str]]] = None,
-        local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
-        cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
-        **kwargs: Any,
-    ) -> "MTMDChatHandler":
-        import fnmatch
-        from pathlib import Path
-
-        try:
-            from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
-            from huggingface_hub.utils import validate_repo_id  # type: ignore
-        except ImportError:
-            raise ImportError(
-                "Llama.from_pretrained requires the huggingface_hub package. "
-                "You can install it with `pip install --upgrade huggingface_hub`."
-            )
-
-        validate_repo_id(repo_id)
-
-        hffs = HfFileSystem()
-
-        files = [
-            file["name"] if isinstance(file, dict) else file
-            for file in hffs.ls(repo_id)  # type: ignore
-        ]
-
-        # split each file into repo_id, subfolder, filename
-        file_list: List[str] = []
-        for file in files:
-            rel_path = Path(file).relative_to(repo_id)
-            file_list.append(str(rel_path))
-
-        matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore
-
-        if len(matching_files) == 0:
-            raise ValueError(
-                f"No file found in {repo_id} that match {filename}\n\n"
-                f"Available Files:\n{json.dumps(file_list)}"
-            )
-
-        if len(matching_files) > 1:
-            raise ValueError(
-                f"Multiple files found in {repo_id} matching {filename}\n\n"
-                f"Available Files:\n{json.dumps(files)}"
-            )
-
-        (matching_file,) = matching_files
-
-        subfolder = str(Path(matching_file).parent)
-        filename = Path(matching_file).name
-
-        # download the file
-        hf_hub_download(
-            repo_id=repo_id,
-            filename=filename,
-            subfolder=subfolder,
-            local_dir=cast(Union[str, Path, None], local_dir),
-            local_dir_use_symlinks=local_dir_use_symlinks,
-            cache_dir=cast(Union[str, Path, None], cache_dir),
-        )
-
-        if local_dir is None:
-            model_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=filename,
-                subfolder=subfolder,
-                local_dir=local_dir,
-                local_dir_use_symlinks=local_dir_use_symlinks,
-                cache_dir=cast(Union[str, Path, None], cache_dir),
-                local_files_only=True,
-            )
-        else:
-            model_path = os.path.join(local_dir, filename)
-
-        return cls(
-            mmproj_path=model_path,
-            **kwargs,
-        )
-
-class GenericMTMDChatHandler(MTMDChatHandler):
-    KNOWN_MEDIA_TAGS = [
-        "<|image_pad|>",
-        "<|audio_pad|>",
-        "<|video_pad|>",
-        "<|image|>",
-        "<|audio|>",
-        "<|video|>",
-        "[IMG]"
-    ]
-
-    def __init__(
-        self,
-        chat_format: str,
-        mmproj_path: str,
-        verbose: bool = True,
-        **kwargs
-    ) -> None:
-        self.chat_format = chat_format
-
-        if verbose:
-            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
-
-        if self.chat_format is None:
-            raise ValueError("Failed to get model chat template automatically.")
-        
-        super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs)
-    
-    def __call__(self, **kwargs):
-        self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-        # Use parent implementation
-        return super().__call__(**kwargs)
-
-class Llava15ChatHandler(MTMDChatHandler):
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-            "{% if message.role == 'system' %}"
-                "{{ message.content }}"
-            "{% endif %}"
-
-            "{% if message.role == 'user' %}"
-                "{% if message.content is string %}"
-                    "\nUSER: {{ message.content }}"
-                "{% elif message.content is iterable %}"
-                    "\nUSER: "
-                    "{% for content in message.content %}"
-                        "{% if content.type == 'image_url' %}"
-                            "{{ content.image_url if content.image_url is string else content.image_url.url }}"
-                        "{% endif %}"
-                    "{% endfor %}"
-                    "{% for content in message.content %}"
-                        "{% if content.type == 'text' %}"
-                            "{{ content.text }}"
-                        "{% endif %}"
-                    "{% endfor %}"
-                "{% endif %}"
-            "{% endif %}"
-
-            "{% if message.role == 'assistant' and message.content is not none %}"
-                "\nASSISTANT: {{ message.content }}"
-            "{% endif %}"
-        "{% endfor %}"
-
-        "{% if add_generation_prompt %}"
-            "\nASSISTANT: "
-        "{% endif %}"
-    )
-
-
-class ObsidianChatHandler(MTMDChatHandler):
-    # Prompt Format
-    # The model followed ChatML format. However, with ### as the seperator
-
-    # <|im_start|>user
-    # What is this sign about?\n<image>
-    # ###
-    # <|im_start|>assistant
-    # The sign is about bullying, and it is placed on a black background with a red background.
-    # ###
-
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-        # System message
-        "{% if message.role == 'system' %}"
-        "<|im_start|>system\n"
-        "{{ message.content }}\n"
-        "###\n"
-        "{% endif %}"
-        # User message
-        "{% if message.role == 'user' %}"
-        "<|im_start|>user\n"
-        "{% if message.content is string %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% if message.content is iterable %}"
-        "{% for content in message.content %}"
-        "{% if content.type == 'image_url' and content.image_url is string %}"
-        "{{ content.image_url }}"
-        "{% endif %}"
-        "{% if content.type == 'image_url' and content.image_url is mapping %}"
-        "{{ content.image_url.url }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% for content in message.content %}"
-        "{% if content.type == 'text' %}"
-        "{{ content.text }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        "###\n"
-        "{% endif %}"
-        # Assistant message
-        "{% if message.role == 'assistant' %}"
-        "<|im_start|>assistant\n"
-        "{{ message.content }}"
-        "###\n"
-        "{% endif %}"
-        "{% endfor %}"
-        # Generation prompt
-        "{% if add_generation_prompt %}"
-        "<|im_start|>assistant\n"
-        "{% endif %}"
-    )
-
-
-class MoondreamChatHandler(MTMDChatHandler):
-    # Chat Format:
-    # f"<image>\n\n{chat_history}Question: {question}\n\nAnswer:"
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-        "{% if message.role == 'user' %}"
-        "{% if message.content is iterable %}"
-        # <image>
-        "{% for content in message.content %}"
-        "{% if content.type == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{{ content.image_url }}\n\n"
-        "{% endif %}"
-        "{% if content.image_url is mapping %}"
-        "{{ content.image_url.url }}\n\n"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-        # Question:
-        "{% for content in message.content %}"
-        "{% if content.type == 'text' %}"
-        "Question: {{ content.text }}\n\n"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        # Question:
-        "{% if message.content is string %}"
-        "Question: {{ message.content }}\n\n"
-        "{% endif %}"
-        "{% endif %}"
-        # Answer:
-        "{% if message.role == 'assistant' %}"
-        "Answer:{{ message.content }}\n\n"
-        "{% endif %}"
-        "{% endfor %}"
-        # Generation prompt
-        "{% if add_generation_prompt %}"
-        "Answer:"
-        "{% endif %}"
-    )
-
-
-class Llava16ChatHandler(MTMDChatHandler):
-    # Example prompt
-    # "DEFAULT_SYSTEM_MESSAGE + USER: <image>\nWhat is shown in this image? ASSISTANT:"
-
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-        "{% if message.role == 'system' %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% if message.role == 'user' %}"
-        "{% if message.content is iterable %}"
-        # <image>
-        "{% for content in message.content %}"
-        "{% if content.type == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{{ content.image_url }}\n"
-        "{% endif %}"
-        "{% if content.image_url is mapping %}"
-        "{{ content.image_url.url }}\n"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-        # Question:
-        "{% for content in message.content %}"
-        "{% if content.type == 'text' %}"
-        "{{ content.text }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        # Question:
-        "{% if message.content is string %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% endif %}"
-        # Answer:
-        "{% if message.role == 'assistant' %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% endfor %}"
-        # Generation prompt
-        "{% if add_generation_prompt %}"
-        "Answer:"
-        "{% endif %}"
-    )
-
-
-class NanoLlavaChatHandler(MTMDChatHandler):
-    # Prompt Format
-    # The model follow the ChatML standard, however, without \n at the end of <|im_end|>:
-
-    # <|im_start|>system
-    # Answer the question<|im_end|><|im_start|>user
-    # <image>
-    # What is the picture about?<|im_end|><|im_start|>assistant
-    DEFAULT_SYSTEM_MESSAGE = "Answer the question"
-
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-        # System message
-        "{% if message.role == 'system' %}"
-        "<|im_start|>system\n"
-        "{{ message.content }}"
-        "<|im_end|>"
-        "{% endif %}"
-        # User message
-        "{% if message.role == 'user' %}"
-        "<|im_start|>user\n"
-        "{% if message.content is string %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% if message.content is iterable %}"
-        "{% for content in message.content %}"
-        "{% if content.type == 'image_url' and content.image_url is string %}"
-        "{{ content.image_url }}"
-        "{% endif %}"
-        "{% if content.type == 'image_url' and content.image_url is mapping %}"
-        "{{ content.image_url.url }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% for content in message.content %}"
-        "{% if content.type == 'text' %}"
-        "{{ content.text }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        "<|im_end|>"
-        "{% endif %}"
-        # Assistant message
-        "{% if message.role == 'assistant' %}"
-        "<|im_start|>assistant\n"
-        "{{ message.content }}"
-        "<|im_end|>"
-        "{% endif %}"
-        "{% endfor %}"
-        # Generation prompt
-        "{% if add_generation_prompt %}"
-        "<|im_start|>assistant\n"
-        "{% endif %}"
-    )
-
-
-class Llama3VisionAlphaChatHandler(MTMDChatHandler):
-    # question = "<image>" + q
-
-    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-
-    CHAT_FORMAT = (
-        "{% for message in messages %}"
-        "<|start_header_id|>"
-        "{% if message.role == 'user' %}"
-        "user<|end_header_id|>\n\n"
-        "{% if message.content is iterable %}"
-        # <image>
-        "{% for content in message.content %}"
-        "{% if content.type == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{{ content.image_url }}"
-        "{% endif %}"
-        "{% if content.image_url is mapping %}"
-        "{{ content.image_url.url }}"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-        # Question:
-        "{% for content in message.content %}"
-        "{% if content.type == 'text' %}"
-        "{{ content.text }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        # Question:
-        "{% if message.content is string %}"
-        "{{ message.content }}"
-        "{% endif %}"
-        "{% endif %}"
-        # Answer:
-        "{% if message.role == 'assistant' %}"
-        "assistant<|end_header_id|>\n\n"
-        "{{ message.content }}"
-        "{% endif %}"
-        "<|eot_id|>"
-        "{% endfor %}"
-        # Generation prompt
-        "{% if add_generation_prompt %}"
-        "<|start_header_id|>assistant<|end_header_id|>\n\n"
-        "{% endif %}"
-    )
-
-
-# alias
-Llama3VisionAlpha = Llama3VisionAlphaChatHandler
-
-
-class MiniCPMv26ChatHandler(MTMDChatHandler):
-
-    CHAT_FORMAT = (
-        "{% set image_count = namespace(value=0) %}"
-        "{% for message in messages %}"
-        "{% if loop.first and messages[0]['role'] != 'system' %}"
-        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-        "{% endif %}"
-        "<|im_start|>{{ message['role'] }}\n"
-        "{% if message['content'] is iterable %}"
-        "{% for content in message['content'] %}"
-        "{% if content.type == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "<image_id>{{ image_count.value }}</image_id>: <image>{{ content.image_url }}</image>"
-        "{% endif %}"
-        "{% if content.image_url is mapping %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "<image_id>{{ image_count.value }}</image_id>: <image>{{ content.image_url.url }}</image>"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-
-        "{% for content in message['content'] %}"
-        "{% if content.type == 'text' %}"
-        "{{ content.text }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% endif %}"
-        "{% if message['content'] is string %}"
-        "{{ message['content'] }}"
-        "{% endif %}"
-        "<|im_end|>\n"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}"
-        "<|im_start|>assistant\n"
-        "{% endif %}"
-    )
-
-
-class MiniCPMv45ChatHandler(MTMDChatHandler):
-    """
-    Handler for MiniCPM-V 4.5 models.
-
-    Supports:
-    - Multi-step tool calls with <tool_call> and <tool_response> XML tags.
-    - Integrated reasoning (thinking) process with <think> tags.
-    - Specialized system prompt handling with tool definitions.
-    - Global image numbering for multi-image processing.
-    """
-
-    # Model specific control tokens
-    MINICPMV_BOS_TOKEN = "<|im_start|>"
-    MINICPMV_EOS_TOKEN = "<|im_end|>"
-    MINICPMV_PAD_TOKEN = "<|endoftext|>"
-
-    # Image placeholder tags
-    MINICPMV_IMAGE_START_TOKEN = "<image>"
-    MINICPMV_IMAGE_END_TOKEN = "</image>"
-    MINICPMV_IMAGE_ID_START_TOKEN = "<image_id>"
-    MINICPMV_IMAGE_ID_END_TOKEN = "</image_id>"
-
-    CHAT_FORMAT = (
-        # --- 1. First System Message & Tools Definitions ---
-        "{%- if tools %}"
-            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}"
-            "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}"
-            "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}"
-            "{{- 'You are provided with function signatures within <tools></tools> XML tags:\\n<tools>' }}"
-            "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}"
-            "{{- '\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>" + MINICPMV_EOS_TOKEN + "\\n' }}"
-        "{%- elif messages[0].role == 'system' %}"
-            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}"
-        "{%- endif %}"
-
-        # --- 2. Message Stream Processing ---
-        "{% set image_count = namespace(value=0) %}"
-        "{%- for message in messages %}"
-            # --- Unified Role Handling (User, Assistant, and subsequent Systems) ---
-            "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}"
-                "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}"
-
-                "{%- set content = message.content %}"
-                "{%- if content is not string %}"
-                    "{%- set ns = namespace(content_str='') %}"
-                    "{%- for item in content %}"
-                        # --- Explicit image_url type and value checking ---
-                        "{%- if item.type == 'image_url' %}"
-                            "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}"
-                            "{%- set image_count.value = image_count.value + 1 %}"
-                            # Format: <image_id>N</image_id>: <image>IMAGE_URL</image>
-                            "{%- set ns.content_str = ns.content_str + '<image_id>' + (image_count.value | string) + '</image_id>: <image>' + image_url + '</image>' %}"
-                        "{%- elif item.type == 'text' %}"
-                            "{%- set ns.content_str = ns.content_str + item.text %}"
-                        "{%- endif %}"
-                    "{%- endfor %}"
-                    "{%- set content = ns.content_str %}"
-                "{%- endif %}"
-
-                "{{- content -}}"
-
-                # Append tool_calls to assistant messages if they exist
-                "{%- if message.role == 'assistant' and message.tool_calls %}"
-                    "{%- for tool_call in message.tool_calls %}"
-                        "{%- set tc = tool_call.function if tool_call.function else tool_call %}"
-                        "{{- '\\n<tool_call>\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}"
-                        "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}"
-                        "{{- '}\\n</tool_call>' }}"
-                    "{%- endfor %}"
-                "{%- endif %}"
-                "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"
-
-            # --- Specialized Tool Response Handling ---
-            # Group consecutive tool responses under a single user-like block
-            "{%- elif message.role == 'tool' %}"
-                "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}"
-                    "{{- '" + MINICPMV_BOS_TOKEN + "user' }}"
-                "{%- endif %}"
-                "{{- '\\n<tool_response>\\n' + message.content + '\\n</tool_response>' }}"
-                "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}"
-                    "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"
-                "{%- endif %}"
-            "{%- endif %}"
-        "{%- endfor %}"
-
-        # --- 3. Generation Prompt ---
-        "{%- if add_generation_prompt %}"
-            "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}"
-            # Handle thinking/reasoning block visibility based on configuration
-            "{%- if enable_thinking is defined and enable_thinking is false %}"
-                "{{- '<think>\\n\\n</think>\\n\\n' }}"
-            "{%- elif enable_thinking is defined and enable_thinking is true %}"
-                "{{- '<think>\\n' }}"
-            "{%- endif %}"
-        "{%- endif %}"
-    )
-
-    def __init__(self, enable_thinking: bool = True, **kwargs):
-        """
-        Initializes the MiniCPM-V 4.5 Handler.
-
-        Args:
-            enable_thinking (bool): If True, model generates reasoning before the final answer.
-            **kwargs: Additional arguments for the base MTMDChatHandler.
-        """
-        self.enable_thinking = enable_thinking
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Inject thinking control flag into the template
-        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
-
-        # Set stop token patch
-        kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
-        return super().__call__(**kwargs)
-
-
-class MiniCPMV46ChatHandler(MTMDChatHandler):
-    """
-    Handler for MiniCPM-V-4.6 models.
-
-    Features:
-    - Aligned with official tokenizer_config.json special tokens.
-    - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens.
-    - Integrated MTMD-style URL and Base64 injection for visual content.
-    - Specialized `<tool_call>` and `<tool_response>` block generation.
-    - Autonomously folds previous reasoning paths using `last_query_index`.
-    - Toggles `<think>` block generation via `enable_thinking` (Defaults to False).
-    """
-
-    # Core tokens
-    MINICPM_BOS_TOKEN = "<|im_start|>"
-    MINICPM_EOS_TOKEN = "<|im_end|>"
-    MINICPM_PAD_TOKEN = "<|endoftext|>"
-
-    # Vision tokens
-    MINICPM_VISION_BOS_TOKEN = "<|vision_start|>"
-    MINICPM_VISION_EOS_TOKEN = "<|vision_end|>"
-    MINICPM_IMAGE_TOKEN = "<|image_pad|>"
-    MINICPM_VIDEO_TOKEN = "<|video_pad|>"
-
-    CHAT_FORMAT = (
-        "{%- if enable_thinking is not defined -%}\n"
-        "    {%- set enable_thinking = false -%}\n"
-        "{%- endif -%}\n"
-        "{%- macro render_content(content, is_system_content=false) -%}\n"
-        "    {%- if content is string -%}\n"
-        "        {{- content -}}\n"
-        "    {%- elif content is iterable and content is not mapping -%}\n"
-        "        {%- set ns = namespace(parts=[]) -%}\n"
-        "        {%- for item in content -%}\n"
-        "            {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n"
-        "                {%- if is_system_content -%}\n"
-        "                    {{- raise_exception('System message cannot contain images.') -}}\n"
-        "                {%- endif -%}\n"
-        "                {%- set url_val = '' -%}\n"
-        "                {%- if item.type == 'image_url' -%}\n"
-        "                    {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n"
-        "                {%- endif -%}\n"
-        "                {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n"
-        # "            {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n"
-        # "                {%- if is_system_content -%}\n"
-        # "                    {{- raise_exception('System message cannot contain videos.') -}}\n"
-        # "                {%- endif -%}\n"
-        # "                {%- set url_val = '' -%}\n"
-        # "                {%- if item.type == 'video_url' -%}\n"
-        # "                    {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n"
-        # "                {%- endif -%}\n"
-        # "                {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n"
-        "            {%- elif 'text' in item -%}\n"
-        "                {%- set ns.parts = ns.parts + [item.text] -%}\n"
-        "            {%- else -%}\n"
-        "                {{- raise_exception('Unexpected item type in content.') -}}\n"
-        "            {%- endif -%}\n"
-        "        {%- endfor -%}\n"
-        "        {{- ns.parts | join('\\n') -}}\n"
-        "    {%- elif content is none or content is undefined -%}\n"
-        "        {{- '' -}}\n"
-        "    {%- else -%}\n"
-        "        {{- raise_exception('Unexpected content type.') -}}\n"
-        "    {%- endif -%}\n"
-        "{%- endmacro -%}\n"
-        "{%- if not messages %}\n"
-        "    {{- raise_exception('No messages provided.') }}\n"
-        "{%- endif %}\n"
-        "{%- if tools and tools is iterable and tools is not mapping %}\n"
-        "    {{- '<|im_start|>system\\n' }}\n"
-        "    {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>' }}\n"
-        "    {%- for tool in tools %}\n"
-        "        {{- '\\n' }}\n"
-        "        {{- tool | tojson }}\n"
-        "    {%- endfor %}\n"
-        "    {{- '\\n</tools>' }}\n"
-        "    {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n"
-        "    {%- if messages[0].role == 'system' %}\n"
-        "        {%- set content = render_content(messages[0].content, true)|trim %}\n"
-        "        {%- if content %}\n"
-        "            {{- '\\n\\n' + content }}\n"
-        "        {%- endif %}\n"
-        "    {%- endif %}\n"
-        "    {{- '<|im_end|>\\n' }}\n"
-        "{%- else %}\n"
-        "    {%- if messages[0].role == 'system' %}\n"
-        "        {%- set content = render_content(messages[0].content, true)|trim %}\n"
-        "        {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n"
-        "    {%- endif %}\n"
-        "{%- endif %}\n"
-        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n"
-        "{%- for message in messages[::-1] %}\n"
-        "    {%- set index = (messages|length - 1) - loop.index0 %}\n"
-        "    {%- if ns.multi_step_tool and message.role == 'user' %}\n"
-        "        {%- set content = render_content(message.content)|trim %}\n"
-        "        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n"
-        "            {%- set ns.multi_step_tool = false %}\n"
-        "            {%- set ns.last_query_index = index %}\n"
-        "        {%- endif %}\n"
-        "    {%- endif %}\n"
-        "{%- endfor %}\n"
-        "{%- if ns.multi_step_tool %}\n"
-        "    {{- raise_exception('No user query found in messages.') }}\n"
-        "{%- endif %}\n"
-        "{%- for message in messages %}\n"
-        "    {%- set content = render_content(message.content)|trim %}\n"
-        "    {%- if message.role == 'system' %}\n"
-        "        {%- if not loop.first %}\n"
-        "            {{- raise_exception('System message must be at the beginning.') }}\n"
-        "        {%- endif %}\n"
-        "    {%- elif message.role == 'user' %}\n"
-        "        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n"
-        "    {%- elif message.role == 'assistant' %}\n"
-        "        {%- set reasoning_content = '' %}\n"
-        "        {%- if message.reasoning_content is string %}\n"
-        "            {%- set reasoning_content = message.reasoning_content %}\n"
-        "        {%- else %}\n"
-        "            {%- if '</think>' in content %}\n"
-        "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n"
-        "                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n"
-        "            {%- endif %}\n"
-        "        {%- endif %}\n"
-        "        {%- set reasoning_content = reasoning_content|trim %}\n"
-        "        {%- if loop.index0 > ns.last_query_index %}\n"
-        "            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n"
-        "        {%- else %}\n"
-        "            {{- '<|im_start|>' + message.role + '\\n' + content }}\n"
-        "        {%- endif %}\n"
-        "        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n"
-        "            {%- for tool_call in message.tool_calls %}\n"
-        "                {%- if tool_call.function is defined %}\n"
-        "                    {%- set tool_call = tool_call.function %}\n"
-        "                {%- endif %}\n"
-        "                {%- if loop.first %}\n"
-        "                    {%- if content|trim %}\n"
-        "                        {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
-        "                    {%- else %}\n"
-        "                        {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
-        "                    {%- endif %}\n"
-        "                {%- else %}\n"
-        "                    {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
-        "                {%- endif %}\n"
-        "                {%- if tool_call.arguments is defined %}\n"
-        "                    {%- for args_name, args_value in tool_call.arguments|items %}\n"
-        "                        {{- '<parameter=' + args_name + '>\\n' }}\n"
-        "                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n"
-        "                        {{- args_value }}\n"
-        "                        {{- '\\n</parameter>\\n' }}\n"
-        "                    {%- endfor %}\n"
-        "                {%- endif %}\n"
-        "                {{- '</function>\\n</tool_call>' }}\n"
-        "            {%- endfor %}\n"
-        "        {%- endif %}\n"
-        "        {{- '<|im_end|>\\n' }}\n"
-        "    {%- elif message.role == 'tool' %}\n"
-        "        {%- if loop.previtem and loop.previtem.role != 'tool' %}\n"
-        "            {{- '<|im_start|>user' }}\n"
-        "        {%- endif %}\n"
-        "        {{- '\\n<tool_response>\\n' }}\n"
-        "        {{- content }}\n"
-        "        {{- '\\n</tool_response>' }}\n"
-        "        {%- if not loop.last and loop.nextitem.role != 'tool' %}\n"
-        "            {{- '<|im_end|>\\n' }}\n"
-        "        {%- elif loop.last %}\n"
-        "            {{- '<|im_end|>\\n' }}\n"
-        "        {%- endif %}\n"
-        "    {%- else %}\n"
-        "        {{- raise_exception('Unexpected message role.') }}\n"
-        "    {%- endif %}\n"
-        "{%- endfor %}\n"
-        "{%- if add_generation_prompt %}\n"
-        "    {{- '<|im_start|>assistant\\n' }}\n"
-        "    {%- if enable_thinking is defined and enable_thinking is false %}\n"
-        "        {{- '<think>\\n\\n</think>\\n\\n' }}\n"
-        "    {%- else %}\n"
-        "        {{- '<think>\\n' }}\n"
-        "    {%- endif %}\n"
-        "{%- endif %}\n"
-    )
-
-    def __init__(self, enable_thinking: bool = True, **kwargs):
-        """
-        Initializes the MiniCPM-V-4.6 Handler.
-
-        Args:
-            enable_thinking (bool): Controls whether to open a `<think>` block for reasoning.
-                                    Defaults to False as per the standard template logic.
-        """
-        self.enable_thinking = enable_thinking
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Inject the thinking variable into the Jinja environment
-        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
-
-        # MiniCPM uses standard <|im_end|> ChatML stop formatting
-        kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN]
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-class Gemma3ChatHandler(MTMDChatHandler):
-
-    GEMMA3_BOI_TOKEN  = "<start_of_image>"
-    GEMMA3_EOI_TOKEN = "<end_of_image>"
-    GEMMA3_BOS_TOKEN = "<bos>"
-    GEMMA3_EOS_TOKEN = "<eos>"
-
-    CHAT_FORMAT = (
-        "{% if messages[0]['role'] == 'system' %}"
-        "{% set loop_messages = messages[1:] %}"
-        "{% if messages[0]['content'] is string %}"
-        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
-        "{% else %}"
-        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"
-        "{% endif %}"
-        "{% else %}"
-        "{% set loop_messages = messages %}"
-        "{% set first_user_prefix = '' %}"
-        "{% endif %}"
-
-        "{% for message in loop_messages %}"
-        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
-        "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}"
-        "{% endif %}"
-
-        "{% if message['role'] == 'assistant' %}"
-        "{% set role = 'model' %}"
-        "{% else %}"
-        "{% set role = message['role'] %}"
-        "{% endif %}"
-
-        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else '') }}"
-
-        "{% if message['content'] is string %}"
-        "{{ message['content'] | trim }}"
-        "{% elif message['content'] is iterable %}"
-        "{% for item in message['content'] %}"
-        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"
-        "{{ '<start_of_image>' + item['image_url'] + '<end_of_image>' }}"
-        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"
-        "{{ '<start_of_image>' + item['image_url']['url'] + '<end_of_image>' }}"
-        "{% elif item['type'] == 'text' %}"
-        "{{ item['text'] | trim }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% else %}"
-        "{{ raise_exception('Invalid content type') }}"
-        "{% endif %}"
-
-        "<end_of_turn>\n"
-        "{% endfor %}"
-
-        "{% if add_generation_prompt %}"
-        "<start_of_turn>model\n"
-        "{% endif %}"
-    )
-
-
-class Gemma4ChatHandler(MTMDChatHandler):
-    """
-    Handler for Gemma 4 models.
-
-    Note on `enable_thinking`:
-        The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models.
-        It is NOT supported by Gemma4 E2B and E4B models.
-
-    [Important Note for Audio Processing!]
-        It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models.
-        Other quantizations are known to have degraded performance;
-        ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463
-    """
-
-    # The special token in Gemma 4
-    GEMMA4_BOI_TOKEN  = "<|image>"
-    GEMMA4_EOI_TOKEN = "<image|>"
-    GEMMA4_BOA_TOKEN  = "<|audio>"
-    GEMMA4_EOA_TOKEN = "<audio|>"
-    GEMMA4_BOS_TOKEN = "<bos>"
-    GEMMA4_EOS_TOKEN = "<eos>"
-    GEMMA4_SOT_TOKEN = "<|turn>"
-    GEMMA4_EOT_TOKEN = "<turn|>"
-    GEMMA4_SOC_TOKEN = "<|channel>"
-    GEMMA4_EOC_TOKEN = "<channel|>"
-    GEMMA4_STC_TOKEN = "<|tool_call>"
-    GEMMA4_ETC_TOKEN = "<tool_call|>"
-    GEMMA4_STD_TOKEN = "<|tool>"
-    GEMMA4_ETD_TOKEN = "<tool|>"
-    GEMMA4_STR_TOKEN = "<|tool_response>"
-    GEMMA4_ETR_TOKEN = "<tool_response|>"
-
-    CHAT_FORMAT = (
-        "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n"
-        "    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n"
-        "    {%- set ns = namespace(found_first=false) -%}\n"
-        "    {%- for key, value in properties | dictsort -%}\n"
-        "        {%- set add_comma = false -%}\n"
-        "        {%- if not filter_keys or key not in standard_keys -%}\n"
-        "            {%- if ns.found_first %},{% endif -%}\n"
-        "            {%- set ns.found_first = true -%}\n"
-        "            {{ key }}:{\n"
-        "            {%- if value['description'] -%}\n"
-        "                description:<|\"|>{{ value['description'] }}<|\"|>\n"
-        "                {%- set add_comma = true -%}\n"
-        "            {%- endif -%}\n"
-        "            {%- if value['type'] | upper == 'STRING' -%}\n"
-        "                {%- if value['enum'] -%}\n"
-        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                    enum:{{ format_argument(value['enum']) }}\n"
-        "                {%- endif -%}\n"
-        "            {%- elif value['type'] | upper == 'ARRAY' -%}\n"
-        "                {%- if value['items'] is mapping and value['items'] -%}\n"
-        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                    items:{\n"
-        "                    {%- set ns_items = namespace(found_first=false) -%}\n"
-        "                    {%- for item_key, item_value in value['items'] | dictsort -%}\n"
-        "                        {%- if item_value is not none -%}\n"
-        "                            {%- if ns_items.found_first %},{% endif -%}\n"
-        "                            {%- set ns_items.found_first = true -%}\n"
-        "                            {%- if item_key == 'properties' -%}\n"
-        "                                properties:{\n"
-        "                                {%- if item_value is mapping -%}\n"
-        "                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n"
-        "                                {%- endif -%}\n"
-        "                                }\n"
-        "                            {%- elif item_key == 'required' -%}\n"
-        "                                required:[\n"
-        "                                {%- for req_item in item_value -%}\n"
-        "                                    <|\"|>{{- req_item -}}<|\"|>\n"
-        "                                    {%- if not loop.last %},{% endif -%}\n"
-        "                                {%- endfor -%}\n"
-        "                                ]\n"
-        "                            {%- elif item_key == 'type' -%}\n"
-        "                                {%- if item_value is string -%}\n"
-        "                                    type:{{ format_argument(item_value | upper) }}\n"
-        "                                {%- else -%}\n"
-        "                                    type:{{ format_argument(item_value | map('upper') | list) }}\n"
-        "                                {%- endif -%}\n"
-        "                            {%- else -%}\n"
-        "                                {{ item_key }}:{{ format_argument(item_value) }}\n"
-        "                            {%- endif -%}\n"
-        "                        {%- endif -%}\n"
-        "                    {%- endfor -%}\n"
-        "                    }\n"
-        "                {%- endif -%}\n"
-        "            {%- endif -%}\n"
-        "            {%- if value['nullable'] %}\n"
-        "                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                nullable:true\n"
-        "            {%- endif -%}\n"
-        "            {%- if value['type'] | upper == 'OBJECT' -%}\n"
-        "                {%- if value['properties'] is defined and value['properties'] is mapping -%}\n"
-        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                    properties:{\n"
-        "                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n"
-        "                    }\n"
-        "                {%- elif value is mapping -%}\n"
-        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                    properties:{\n"
-        "                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n"
-        "                    }\n"
-        "                {%- endif -%}\n"
-        "                {%- if value['required'] -%}\n"
-        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "                    required:[\n"
-        "                    {%- for item in value['required'] | default([]) -%}\n"
-        "                        <|\"|>{{- item -}}<|\"|>\n"
-        "                        {%- if not loop.last %},{% endif -%}\n"
-        "                    {%- endfor -%}\n"
-        "                    ]\n"
-        "                {%- endif -%}\n"
-        "            {%- endif -%}\n"
-        "            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
-        "            type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n"
-        "        {%- endif -%}\n"
-        "    {%- endfor -%}\n"
-        "{%- endmacro -%}\n"
-        "{%- macro format_function_declaration(tool_data) -%}\n"
-        "    declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n"
-        "    {%- set params = tool_data['function']['parameters'] -%}\n"
-        "    {%- if params -%}\n"
-        "        ,parameters:{\n"
-        "        {%- if params.get('properties') -%}\n"
-        "            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n"
-        "        {%- endif -%}\n"
-        "        {%- if params.get('required') -%}\n"
-        "            required:[\n"
-        "            {%- for item in params['required'] -%}\n"
-        "                <|\"|>{{- item -}}<|\"|>\n"
-        "                {{- ',' if not loop.last -}}\n"
-        "            {%- endfor -%}\n"
-        "            ],\n"
-        "        {%- endif -%}\n"
-        "        {%- if params.get('type') -%}\n"
-        "            type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "    {%- if 'response' in tool_data['function'] -%}\n"
-        "        {%- set response_declaration = tool_data['function']['response'] -%}\n"
-        "        ,response:{\n"
-        "        {%- if response_declaration['description'] -%}\n"
-        "            description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n"
-        "        {%- endif -%}\n"
-        "        {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n"
-        "            type:<|\"|>{{- response_declaration['type'] | upper -}}<|\"|>}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "    }\n"
-        "{%- endmacro -%}\n"
-        "{%- macro format_argument(argument, escape_keys=True) -%}\n"
-        "    {%- if argument is string -%}\n"
-        "        {{- '<|\"|>' + argument + '<|\"|>' -}}\n"
-        "    {%- elif argument is boolean -%}\n"
-        "        {{- 'true' if argument else 'false' -}}\n"
-        "    {%- elif argument is mapping -%}\n"
-        "        {{- '{' -}}\n"
-        "        {%- set ns = namespace(found_first=false) -%}\n"
-        "        {%- for key, value in argument | dictsort -%}\n"
-        "            {%- if ns.found_first %},{% endif -%}\n"
-        "            {%- set ns.found_first = true -%}\n"
-        "            {%- if escape_keys -%}\n"
-        "                {{- '<|\"|>' + key + '<|\"|>' -}}\n"
-        "            {%- else -%}\n"
-        "                {{- key -}}\n"
-        "            {%- endif -%}\n"
-        "            :{{- format_argument(value, escape_keys=escape_keys) -}}\n"
-        "        {%- endfor -%}\n"
-        "        {{- '}' -}}\n"
-        "    {%- elif argument is sequence -%}\n"
-        "        {{- '[' -}}\n"
-        "        {%- for item in argument -%}\n"
-        "            {{- format_argument(item, escape_keys=escape_keys) -}}\n"
-        "            {%- if not loop.last %},{% endif -%}\n"
-        "        {%- endfor -%}\n"
-        "        {{- ']' -}}\n"
-        "    {%- else -%}\n"
-        "        {{- argument -}}\n"
-        "    {%- endif -%}\n"
-        "{%- endmacro -%}\n"
-        "{%- macro strip_thinking(text) -%}\n"
-        "    {%- set ns = namespace(result='') -%}\n"
-        "    {%- for part in text.split('<channel|>') -%}\n"
-        "        {%- if '<|channel>' in part -%}\n"
-        "            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n"
-        "        {%- else -%}\n"
-        "            {%- set ns.result = ns.result + part -%}\n"
-        "        {%- endif -%}\n"
-        "    {%- endfor -%}\n"
-        "    {{- ns.result | trim -}}\n"
-        "{%- endmacro -%}\n"
-        "\n"
-        "{%- macro format_tool_response_block(tool_name, response) -%}\n"
-        "    {{- '<|tool_response>' -}}\n"
-        "    {%- if response is mapping -%}\n"
-        "        {{- 'response:' + tool_name + '{' -}}\n"
-        "        {%- for key, value in response | dictsort -%}\n"
-        "            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n"
-        "            {%- if not loop.last %},{% endif -%}\n"
-        "        {%- endfor -%}\n"
-        "        {{- '}' -}}\n"
-        "    {%- else -%}\n"
-        "        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n"
-        "    {%- endif -%}\n"
-        "    {{- '<tool_response|>' -}}\n"
-        "{%- endmacro -%}\n"
-        "\n"
-        "{%- set ns = namespace(prev_message_type=None) -%}\n"
-        "{%- set loop_messages = messages -%}\n"
-        "{{- bos_token -}}\n"
-        "{#- Handle System/Tool Definitions Block -#}\n"
-        "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n"
-        "    {{- '<|turn>system\\n' -}}\n"
-        "    {#- Inject Thinking token at the very top of the FIRST system turn -#}\n"
-        "    {%- if enable_thinking is defined and enable_thinking -%}\n"
-        "        {{- '<|think|>\\n' -}}\n"
-        "        {%- set ns.prev_message_type = 'think' -%}\n"
-        "    {%- endif -%}\n"
-        "    {%- if messages[0]['role'] in ['system', 'developer'] -%}\n"
-        "        {%- if messages[0]['content'] is string -%}\n"
-        "            {{- messages[0]['content'] | trim -}}\n"
-        "        {%- elif messages[0]['content'] is sequence -%}\n"
-        "            {%- for item in messages[0]['content'] -%}\n"
-        "                {{- item['text'] | trim + ' '-}}\n"
-        "            {%- endfor -%}\n"
-        "        {%- endif -%}\n"
-        "        {%- set loop_messages = messages[1:] -%}\n"
-        "    {%- endif -%}\n"
-        "    {%- if tools -%}\n"
-        "        {%- for tool in tools %}\n"
-        "            {{- '<|tool>' -}}\n"
-        "            {{- format_function_declaration(tool) | trim -}}\n"
-        "            {{- '<tool|>' -}}\n"
-        "        {%- endfor %}\n"
-        "        {%- set ns.prev_message_type = 'tool' -%}\n"
-        "    {%- endif -%}\n"
-        "    {{- '<turn|>\\n' -}}\n"
-        "{%- endif %}\n"
-        "\n"
-        "{#- Pre-scan: find last user message index for reasoning guard -#}\n"
-        "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n"
-        "{%- for i in range(loop_messages | length) -%}\n"
-        "    {%- if loop_messages[i]['role'] == 'user' -%}\n"
-        "        {%- set ns_turn.last_user_idx = i -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "\n"
-        "{#- Loop through messages -#}\n"
-        "{%- for message in loop_messages -%}\n"
-        "    {%- if message['role'] != 'tool' -%}\n"
-        "    {%- set ns.prev_message_type = None -%}\n"
-        "    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n"
-        "    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n"
-        "    {%- set prev_nt = namespace(role=None, found=false) -%}\n"
-        "    {%- if loop.index0 > 0 -%}\n"
-        "        {%- for j in range(loop.index0 - 1, -1, -1) -%}\n"
-        "            {%- if not prev_nt.found -%}\n"
-        "                {%- if loop_messages[j]['role'] != 'tool' -%}\n"
-        "                    {%- set prev_nt.role = loop_messages[j]['role'] -%}\n"
-        "                    {%- set prev_nt.found = true -%}\n"
-        "                {%- endif -%}\n"
-        "            {%- endif -%}\n"
-        "        {%- endfor -%}\n"
-        "    {%- endif -%}\n"
-        "    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n"
-        "    {%- if not continue_same_model_turn -%}\n"
-        "        {{- '<|turn>' + role + '\\n' }}\n"
-        "    {%- endif -%}\n"
-        "\n"
-        "    {#- Render reasoning/reasoning_content as thinking channel -#}\n"
-        "    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n"
-        "    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n"
-        "        {{- '<|channel>thought\\n' + thinking_text + '\\n<channel|>' -}}\n"
-        "    {%- endif -%}\n"
-        "\n"
-        "            {%- if message.get('tool_calls') -%}\n"
-        "                {%- for tool_call in message['tool_calls'] -%}\n"
-        "                    {%- set function = tool_call['function'] -%}\n"
-        "                    {{- '<|tool_call>call:' + function['name'] + '{' -}}\n"
-        "                    {%- if function['arguments'] is mapping -%}\n"
-        "                        {%- set ns_args = namespace(found_first=false) -%}\n"
-        "                        {%- for key, value in function['arguments'] | dictsort -%}\n"
-        "                            {%- if ns_args.found_first %},{% endif -%}\n"
-        "                            {%- set ns_args.found_first = true -%}\n"
-        "                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n"
-        "                        {%- endfor -%}\n"
-        "                    {%- elif function['arguments'] is string -%}\n"
-        "                        {{- function['arguments'] -}}\n"
-        "                    {%- endif -%}\n"
-        "                    {{- '}<tool_call|>' -}}\n"
-        "                {%- endfor -%}\n"
-        "                {%- set ns.prev_message_type = 'tool_call' -%}\n"
-        "            {%- endif -%}\n"
-        "\n"
-        "            {%- set ns_tr_out = namespace(flag=false) -%}\n"
-        "            {%- if message.get('tool_responses') -%}\n"
-        "                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n"
-        "                {%- for tool_response in message['tool_responses'] -%}\n"
-        "                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n"
-        "                    {%- set ns_tr_out.flag = true -%}\n"
-        "                    {%- set ns.prev_message_type = 'tool_response' -%}\n"
-        "                {%- endfor -%}\n"
-        "            {%- elif message.get('tool_calls') -%}\n"
-        "                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n"
-        "                {%- set ns_tool_scan = namespace(stopped=false) -%}\n"
-        "                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n"
-        "                    {%- if ns_tool_scan.stopped -%}\n"
-        "                    {%- elif loop_messages[k]['role'] != 'tool' -%}\n"
-        "                        {%- set ns_tool_scan.stopped = true -%}\n"
-        "                    {%- else -%}\n"
-        "                        {%- set follow = loop_messages[k] -%}\n"
-        "                        {#- Resolve tool_call_id to function name -#}\n"
-        "                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n"
-        "                        {%- for tc in message['tool_calls'] -%}\n"
-        "                            {%- if tc.get('id') == follow.get('tool_call_id') -%}\n"
-        "                                {%- set ns_tname.name = tc['function']['name'] -%}\n"
-        "                            {%- endif -%}\n"
-        "                        {%- endfor -%}\n"
-        "                        {#- Handle content as string or content-parts array -#}\n"
-        "                        {%- set tool_body = follow.get('content') -%}\n"
-        "                        {%- if tool_body is string -%}\n"
-        "                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n"
-        "                        {%- elif tool_body is sequence and tool_body is not string -%}\n"
-        "                            {%- set ns_txt = namespace(s='') -%}\n"
-        "                            {%- for part in tool_body -%}\n"
-        "                                {%- if part.get('type') == 'text' -%}\n"
-        "                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n"
-        "                                {%- endif -%}\n"
-        "                            {%- endfor -%}\n"
-        "                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n"
-        "                            {%- for part in tool_body -%}\n"
-        "                                {%- if part.get('type') == 'image_url' -%}\n"
-        "                                    {%- set url_val = part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n"
-        "                                    {{- '<|image|>' + url_val -}}\n"
-        "                                {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n"
-        "                                    {%- if part.get('type') == 'audio_url' -%}\n"
-        "                                        {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n"
-        "                                        {{- '<|audio|>' + audio_val -}}\n"
-        "                                    {%- elif part.get('type') == 'input_audio' -%}\n"
-        "                                        {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n"
-        "                                        {{- '<|audio|>' + audio_val -}}\n"
-        "                                    {%- endif -%}\n"
-        # "                              {%- elif part.get('type') == 'video_url' -%}\n"
-        # "                                  {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n"
-        # "                                  {{- '<|video|>' + video_val -}}\n"
-        "                                {%- endif -%}\n"
-        "                            {%- endfor -%}\n"
-        "                        {%- else -%}\n"
-        "                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n"
-        "                        {%- endif -%}\n"
-        "                        {%- set ns_tr_out.flag = true -%}\n"
-        "                        {%- set ns.prev_message_type = 'tool_response' -%}\n"
-        "                    {%- endif -%}\n"
-        "                {%- endfor -%}\n"
-        "            {%- endif -%}\n"
-        "\n"
-        "            {%- set captured_content -%}\n"
-        "            {%- if message['content'] is string -%}\n"
-        "                {%- if role == 'model' -%}\n"
-        "                    {{- strip_thinking(message['content']) -}}\n"
-        "                {%- else -%}\n"
-        "                    {{- message['content'] | trim -}}\n"
-        "                {%- endif -%}\n"
-        "            {%- elif message['content'] is sequence -%}\n"
-        "                {%- for item in message['content'] -%}\n"
-        "                    {%- if item['type'] == 'text' -%}\n"
-        "                        {%- if role == 'model' -%}\n"
-        "                            {{- strip_thinking(item['text']) -}}\n"
-        "                        {%- else -%}\n"
-        "                            {{- item['text'] | trim -}}\n"
-        "                        {%- endif -%}\n"
-        "                    {%- elif item['type'] == 'image_url' -%}\n"
-        "                        {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n"
-        "                        {{- '<|image|>' + url_val -}}\n"
-        "                        {%- set ns.prev_message_type = 'image' -%}\n"
-        "                    {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n"
-        "                        {%- if item['type'] == 'audio_url' -%}\n"
-        "                            {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n"
-        "                            {{- '<|audio|>' + audio_val -}}\n"
-        "                        {%- elif item['type'] == 'input_audio' -%}\n"
-        "                            {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n"
-        "                            {{- '<|audio|>' + audio_val -}}\n"
-        "                        {%- endif -%}\n"
-        "                        {%- set ns.prev_message_type = 'audio' -%}\n"
-        "                    {%- endif -%}\n"
-        # "                    {%- elif item['type'] == 'video_url' -%}\n"
-        # "                        {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n"
-        # "                        {{- '<|video|>' + video_val -}}\n"
-        # "                        {%- set ns.prev_message_type = 'video' -%}\n"
-        "                {%- endfor -%}\n"
-        "            {%- endif -%}\n"
-        "            {%- endset -%}\n"
-        "\n"
-        "            {{- captured_content -}}\n"
-        "            {%- set has_content = captured_content | trim | length > 0 -%}\n"
-        "\n"
-        "        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n"
-        "            {{- '<|tool_response>' -}}\n"
-        "        {%- elif not (ns_tr_out.flag and not has_content) -%}\n"
-        "            {{- '<turn|>\\n' -}}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "\n"
-        "{%- if add_generation_prompt -%}\n"
-        "    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n"
-        "        {{- '<|turn>model\\n' -}}\n"
-        "        {%- if not enable_thinking | default(false) -%}\n"
-        "            {{- '<|channel>thought\\n<channel|>' -}}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endif -%}\n"
-    )
-
-    def __init__(self, enable_thinking: bool = True, **kwargs):
-        """
-        Initializes the Gemma 4 Handler.
-
-        Args:
-            enable_thinking (bool): Controls whether the <|think|> tag is injected and
-                                    manages <|channel>thought behavior.
-                                    Note: ONLY supported on Gemma4 31B and 26BA4B models.
-                                    NOT supported on Gemma4 E2B and E4B models.
-        """
-        self.enable_thinking = enable_thinking
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Inject the thinking variable into the Jinja environment
-        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
-
-        # Set the stop token based on Gemma 4's format (<turn|>)
-        # generation_config.json:   "eos_token_id": [1, 106, 50]
-        kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN]
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-class GLM41VChatHandler(MTMDChatHandler):
-    # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32.
-
-    GLM41V_EOS_TOKEN = "<|endoftext|>"
-    GLM41V_PAD_TOKEN = "<|endoftext|>"
-    GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>"
-    GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>"
-
-    CHAT_FORMAT = (
-        "[gMASK]<sop>\n"
-        "{%- for msg in messages -%}"
-            "{%- if msg.role == 'system' -%}"
-                "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
-            "{%- elif msg.role == 'user' -%}"
-                "<|user|>\n"
-                "{%- if msg.content is string -%}"
-                    "{{ msg.content }}"
-                "{%- else -%}"
-                    "{%- for item in msg.content -%}"
-                        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
-                            "<|begin_of_image|>"
-                            "{%- if item.image_url is string -%}"
-                                "{{- item.image_url -}}"
-                            "{%- else -%}"
-                                "{{- item.image_url.url -}}"
-                            "{%- endif -%}"
-                            "<|end_of_image|>"
-                        "{%- elif item.type == 'text' -%}"
-                            "{{ item.text }}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-                "{%- endif -%}{{ GLM41V_EOS_TOKEN }}"
-            "{%- elif msg.role == 'assistant' -%}"
-                "{%- if msg.metadata -%}"
-                    "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
-                "{%- else -%}"
-                    "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
-                "{%- endif -%}"
-            "{%- endif -%}"
-        "{%- endfor -%}"
-        "{%- if add_generation_prompt -%}"
-            "<|assistant|>\n"
-        "{%- endif -%}"
-    )
-
-    def __call__(self, **kwargs):
-        self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN
-        # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json
-        stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", "</answer>"] # Stop token patch
-        kwargs['stop'] = stop_tokens
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-        # Use parent implementation
-        return super().__call__(**kwargs)
-
-
-class GLM46VChatHandler(MTMDChatHandler):
-    GLM46V_EOS_TOKEN = "<|endoftext|>"
-    GLM46V_PAD_TOKEN = "<|endoftext|>"
-    GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>"
-    GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>"
-
-    CHAT_FORMAT = (
-        "[gMASK]<sop>"
-        "{%- if tools -%}"
-            "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n"
-            "You are provided with function signatures within <tools></tools> XML tags:\n<tools>\n"
-            "{%- for tool in tools -%}"
-                "{{ tool | tojson(ensure_ascii=False) }}\n"
-            "{%- endfor -%}"
-            "</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n"
-            "<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n...\n</tool_call>"
-        "{%- endif -%}"
-
-        "{%- for m in messages -%}"
-            "{%- if m.role == 'system' -%}"
-                "<|system|>\n{{ m.content }}"
-            "{%- elif m.role == 'user' -%}"
-                "<|user|>\n"
-                "{%- if m.content is string -%}"
-                    "{{ m.content }}"
-                "{%- else -%}"
-                    "{%- for item in m.content -%}"
-                        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
-                            "<|begin_of_image|>"
-                            "{%- if item.image_url is string -%}"
-                                "{{- item.image_url -}}"
-                            "{%- else -%}"
-                                "{{- item.image_url.url -}}"
-                            "{%- endif -%}"
-                            "<|end_of_image|>"
-                        "{%- elif item.type == 'text' -%}"
-                            "{{ item.text }}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-                "{%- endif -%}"
-                # If enable_thinking is disabled, insert `/nothink` according to the source code logic.
-                "{{ '/nothink' if not enable_thinking else '' }}"
-            "{%- elif m.role == 'assistant' -%}"
-                "<|assistant|>"
-                "{%- if enable_thinking -%}"
-                    "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}"
-                    "\n<think>{{ reasoning.strip() }}</think>"
-                "{%- else -%}"
-                    "\n<think></think>"
-                "{%- endif -%}"
-                "{{ '\n' + m.content.strip() if m.content.strip() else '' }}"
-            "{%- endif -%}"
-            "{{ GLM46V_EOS_TOKEN }}"
-        "{%- endfor -%}"
-
-        "{%- if add_generation_prompt -%}"
-            "<|assistant|>\n"
-            "{{ '<think>' if enable_thinking else '<think></think>\n' }}"
-        "{%- endif -%}"
-    )
-
-    def __init__(self, enable_thinking: bool = True, **kwargs):
-        """
-        GLM-4.6V Handler
-        Parameters:
-        - enable_thinking (bool): Whether to enable the model's think process. The default is True.
-        """
-        self.enable_thinking = enable_thinking
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
-        self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN
-
-        # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json
-        kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-class GraniteDoclingChatHandler(MTMDChatHandler):
-    """
-    Handler for Granite-Docling models.
-
-    Format(512x512): <loc_xmin><loc_ymin><loc_xmax><loc_ymax>Content
-
-    Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!!
-                    Since the model does not have special tokens for the start and end of an image,
-                    it is recommended to process only one image at a time.
-                    You can iterate through the images individually for recognition.
-
-    """
-    GRANITE_BOS_TOKEN = "<|start_of_role|>"
-    GRANITE_EOS_TOKEN = "<|end_of_text|>"
-    GRANITE_PAD_TOKEN = "<|end_of_text|>"
-    GRANITE_IMAGE_TOKEN = "<image>"
-
-    CHAT_FORMAT = (
-        "{%- for message in messages -%}"
-            "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}"
-            "{%- if message['content'] is string -%}"
-                "{{- message['content'] -}}"
-            "{%- else -%}"
-                "{%- for part in message['content'] -%}"
-                    "{%- if part['type'] == 'text' -%}"
-                        "{{- part['text'] -}}"
-                    "{%- elif part['type'] == 'image_url' -%}"
-                        "{%- if part.image_url is string -%}"
-                            "{{- part.image_url -}}"
-                        "{%- else -%}"
-                            "{{- part.image_url.url -}}"
-                        "{%- endif -%}"
-                    "{%- endif -%}"
-                "{%- endfor -%}"
-            "{%- endif -%}"
-            "{{- '<|end_of_text|>\n' -}}"
-        "{%- endfor -%}"
-        "{%- if add_generation_prompt -%}"
-            "{{- '<|start_of_role|>assistant' -}}"
-            # Support the 'controls' parameter if present in generation arguments
-            "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}"
-            "{{- '<|end_of_role|>' -}}"
-        "{%- endif -%}"
-    )
-
-    def __init__(self, controls: dict = None, **kwargs):
-        """
-        Granite-Docling Handler
-        Args:
-            controls (dict, optional): Operational parameters passed to the assistant role.
-
-            The 'controls' parameter is used to guide the model's behavior or output format.
-            Common examples for 'controls' include:
-             - Document Parsing: {"mode": "document_parsing", "format": "json"}
-        """
-        self.controls = controls
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Inject controls into the template environment
-        self.extra_template_arguments["controls"] = self.controls
-        self.DEFAULT_SYSTEM_MESSAGE = None
-        kwargs['stop'] = [self.GRANITE_EOS_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-
-        return super().__call__(**kwargs)
-
-
-class LFM2VLChatHandler(MTMDChatHandler):
-    LFM2VL_BOS_TOKEN = "<|startoftext|>"
-    LFM2VL_EOS_TOKEN = "<|im_end|>"
-    LFM2VL_IMAGE_START_TOKEN = "<|image_start|>"
-    LFM2VL_IMAGE_END_TOKEN = "<|image_end|>"
-
-    CHAT_FORMAT = (
-        "{%- for message in messages -%}"
-            "{{ '<|im_start|>' + message['role'] + '\n' }}"
-            "{%- if message['content'] is string -%}"
-                "{{ message['content'] }}"
-            "{%- else -%}"
-                "{%- for content in message['content'] -%}"
-                    "{%- if 'image_url' in content -%}"
-                        "{%- if content.image_url is string -%}"
-                            "<|image_start|>{{ content.image_url }}<|image_end|>"
-                        "{%- else -%}"
-                            "<|image_start|>{{ content.image_url.url }}<|image_end|>"
-                        "{%- endif -%}"
-                    "{%- elif content['type'] == 'text' -%}"
-                        "{{ content['text'] }}"
-                    "{%- endif -%}"
-                "{%- endfor -%}"
-            "{%- endif -%}"
-            "{{ '<|im_end|>\n' }}"
-        "{%- endfor -%}"
-        "{%- if add_generation_prompt -%}"
-            "{{ '<|im_start|>assistant\n' }}"
-        "{%- endif -%}"
-    )
-
-    def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs):
-        """
-        LFM2-VL Handler
-        LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256
-        """
-        self.image_min_tokens = image_min_tokens
-        self.image_max_tokens = image_max_tokens
-        super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs)
-
-    def __call__(self, **kwargs):
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-class LFM25VLChatHandler(MTMDChatHandler):
-    """
-    Handler for LFM2.5-VL multimodal models.
-
-    Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing.
-    """
-    # Aligned with LFM2.5-VL tokenizer_config
-    LFM25VL_BOS_TOKEN = "<|startoftext|>"
-    LFM25VL_EOS_TOKEN = "<|im_end|>"
-    LFM25VL_PAD_TOKEN = "<|pad|>"
-
-    # Image specific tokens
-    LFM25VL_IMAGE_TOKEN = "<image>"
-    LFM25VL_IMAGE_START_TOKEN = "<|image_start|>"
-    LFM25VL_IMAGE_END_TOKEN = "<|image_end|>"
-    LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>"
-
-    CHAT_FORMAT = (
-        "{{- bos_token -}}\n"
-        "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n"
-        "{%- set ns = namespace(system_prompt='', content='') -%}\n"
-        "{%- if messages[0]['role'] == 'system' -%}\n"
-        "    {%- set ns.system_prompt = messages[0]['content'] -%}\n"
-        "    {%- set messages = messages[1:] -%}\n"
-        "{%- endif -%}\n"
-        "{%- if tools -%}\n"
-        "    {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n"
-        "    {%- for tool in tools -%}\n"
-        "        {%- if tool is not string -%}\n"
-        "            {%- set tool = tool | tojson -%}\n"
-        "        {%- endif -%}\n"
-        "        {%- set ns.system_prompt = ns.system_prompt + tool -%}\n"
-        "        {%- if not loop.last -%}\n"
-        "            {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n"
-        "        {%- endif -%}\n"
-        "    {%- endfor -%}\n"
-        "    {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n"
-        "{%- endif -%}\n"
-        "{%- if ns.system_prompt -%}\n"
-        "    {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n"
-        "{%- endif -%}\n"
-        "{%- set ns.last_assistant_index = -1 -%}\n"
-        "{%- for message in messages -%}\n"
-        "    {%- if message['role'] == 'assistant' -%}\n"
-        "        {%- set ns.last_assistant_index = loop.index0 -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "{%- for message in messages -%}\n"
-        "    {{- '<|im_start|>' + message['role'] + '\\n' -}}\n"
-        "    {%- set content = message['content'] -%}\n"
-        "    {%- if content is not string -%}\n"
-        "        {%- set ns.content = '' -%}\n"
-        "        {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n"
-        "        {%- for item in content -%}\n"
-        "            {%- if item['type'] == 'image_url' -%}\n"
-        "                {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n"
-        "                {%- set ns.content = ns.content + img_val -%}\n"
-        "            {%- elif item['type'] == 'text' -%}\n"
-        "                {%- set ns.content = ns.content + item['text'] -%}\n"
-        "            {%- else -%}\n"
-        "                {%- set ns.content = ns.content + (item | tojson) -%}\n"
-        "            {%- endif -%}\n"
-        "        {%- endfor -%}\n"
-        "        {%- set content = ns.content -%}\n"
-        "    {%- endif -%}\n"
-        "    {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n"
-        "        {%- if '</think>' in content -%}\n"
-        "            {%- set content = content.split('</think>')[-1] | trim -%}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "    {{- content + '<|im_end|>\\n' -}}\n"
-        "{%- endfor -%}\n"
-        "{%- if add_generation_prompt -%}\n"
-        "    {{- '<|im_start|>assistant\\n' -}}\n"
-        "{%- endif -%}\n"
-    )
-
-    def __init__(self, keep_past_thinking: bool = False, **kwargs):
-        self.keep_past_thinking = keep_past_thinking
-        super().__init__(**kwargs)
-
-
-    def __call__(self, **kwargs):
-        if self.image_min_tokens > 256:
-            if self.verbose:
-                print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. Please reset it to between 64 and 256.")
-            self.image_min_tokens = -1
-
-        self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking
-
-        kwargs['stop'] = [self.LFM25VL_EOS_TOKEN]
-
-        if self.verbose:
-            print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing")
-        return super().__call__(**kwargs)
-
-
-class PaddleOCRChatHandler(MTMDChatHandler):
-    """
-    Handler for PaddleOCR 1.5/1.6 multimodal models.
-    """
-
-    PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>"
-    PADDLEOCR_BOS_TOKEN = "<s>"
-    PADDLEOCR_EOS_TOKEN = "</s>"
-    PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>"
-    PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>"
-    PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>"
-
-    CHAT_FORMAT = (
-        "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}"
-        "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}"
-        "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}"
-
-        "{{- cls_token -}}"
-        "{%- for message in messages -%}"
-            "{%- if message['role'] == 'user' -%}"
-                "{{- 'User: ' -}}"
-
-                # Robust parsing: Check if content is string or list
-                "{%- if message['content'] is string -%}"
-                    "{{- message['content'] -}}"
-                "{%- else -%}"
-                    # Pass 1: Render all images first
-                    "{%- for content in message['content'] -%}"
-                        "{%- if content['type'] == 'image_url' and 'image_url' in content -%}"
-                            "{{- '<|IMAGE_START|>' -}}"
-                                "{%- if content.image_url is string -%}"
-                                    "{{- content.image_url -}}"
-                                "{%- else -%}"
-                                    "{{- content.image_url.url -}}"
-                                "{%- endif -%}"
-                            "{{- '<|IMAGE_END|>' -}}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-
-                    # Pass 2: Render all text second
-                    "{%- for content in message['content'] -%}"
-                        "{%- if content['type'] == 'text' -%}"
-                            "{{- content['text'] -}}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-                "{%- endif -%}"
-                "{{- '\\n' -}}"
-
-            "{%- elif message['role'] == 'assistant' -%}"
-                "{{- 'Assistant:\\n' -}}"
-                "{%- if message['content'] is string -%}"
-                    "{{- message['content'] -}}"
-                "{%- else -%}"
-                    "{%- for content in message['content'] -%}"
-                        "{%- if content['type'] == 'text' -%}"
-                            "{{- content['text'] -}}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-                "{%- endif -%}"
-                "{{- eos_token -}}"
-
-            "{%- elif message['role'] == 'system' -%}"
-                "{%- if message['content'] is string -%}"
-                    "{{- message['content'] + '\\n' -}}"
-                "{%- else -%}"
-                    "{%- for content in message['content'] -%}"
-                        "{%- if content['type'] == 'text' -%}"
-                            "{{- content['text'] + '\\n' -}}"
-                        "{%- endif -%}"
-                    "{%- endfor -%}"
-                "{%- endif -%}"
-            "{%- endif -%}"
-        "{%- endfor -%}"
-
-        "{%- if add_generation_prompt -%}"
-            "{{- 'Assistant:\\n' -}}"
-        "{%- endif -%}"
-    )
-
-    def __init__(
-        self,
-        image_min_tokens: int = -1,
-        image_max_tokens: int = -1,
-        **kwargs
-    ):
-        self.image_min_tokens = image_min_tokens
-        self.image_max_tokens = image_max_tokens
-        super().__init__(
-            image_min_tokens=self.image_min_tokens,
-            image_max_tokens=self.image_max_tokens,
-            **kwargs
-        )
-
-    def __call__(self, **kwargs):
-        # Set the specific stop token defined in the PaddleOCR template
-        kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-class Qwen25VLChatHandler(MTMDChatHandler):
-
-    QWEN25_VL_BOS_TOKEN = "<|endoftext|>"
-    QWEN25_VL_PAD_TOKEN = "<|endoftext|>"
-    QWEN25_VL_EOS_TOKEN = "<|im_end|>"
-
-    CHAT_FORMAT = (
-        "{% set image_count = namespace(value=0) %}"
-        "{% for message in messages %}"
-        "{% if loop.first and message['role'] != 'system' %}"
-        "<|im_start|>system\n"
-        "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
-        "{% endif %}"
-        "<|im_start|>{{ message['role'] }}\n"
-        "{% if message['content'] is string %}"
-        "{{ message['content'] }}<|im_end|>\n"
-        "{% else %}"
-        "{% for content in message['content'] %}"
-        "{% if content['type'] == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>"
-        "{% else %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>"
-        "{% endif %}"
-        "{% elif content['type'] == 'text' %}"
-        "{{ content['text'] }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        "{% endfor %}"
-        "<|im_start|>assistant\n"
-    )
-
-    def __call__(self, **kwargs):
-        kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
-
-        # Use parent implementation
-        return super().__call__(**kwargs)
-
-class Qwen3ASRChatHandler(MTMDChatHandler):
-    """
-    Handler for Qwen 3 ASR (Automatic Speech Recognition) models.
-
-    Features:
-    - Highly specialized for Speech-to-Text tasks.
-    - Aggregates all system text into a single cohesive system block.
-    - Drops user text entirely, extracting ONLY audio data into a unified user turn.
-    - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>.
-    - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url.
-    """
-
-    DEFAULT_SYSTEM_MESSAGE = """
-    You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language.
-    You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization.
-    """
-
-    QWEN3_ASR_BOS_TOKEN = "<|im_start|>"
-    QWEN3_ASR_PAD_TOKEN = "<|endoftext|>"
-    QWEN3_ASR_EOS_TOKEN = "<|im_end|>"
-
-
-    QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>"
-    QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>"
-    QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>"
-
-    CHAT_FORMAT = (
-        "{%- set ns = namespace(system_text='') -%}\n"
-        "{%- for m in messages -%}\n"
-        "    {%- if m.role == 'system' -%}\n"
-        "        {%- if m.content is string -%}\n"
-        "            {%- set ns.system_text = ns.system_text + m.content -%}\n"
-        "        {%- else -%}\n"
-        "            {%- for c in m.content -%}\n"
-        "                {%- if c.type == 'text' and (c.text is defined) -%}\n"
-        "                    {%- set ns.system_text = ns.system_text + c.text -%}\n"
-        "                {%- endif -%}\n"
-        "            {%- endfor -%}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "\n"
-        "{%- set ns2 = namespace(audio_tokens='') -%}\n"
-        "{%- for m in messages -%}\n"
-        "    {%- if m.content is not string -%}\n"
-        "        {%- for c in m.content -%}\n"
-        "            {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n"
-        "                {#- MTMD Audio Injection -#}\n"
-        "                {%- set audio_val = '' -%}\n"
-        "                {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n"
-        "                    {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n"
-        "                {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n"
-        "                    {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n"
-        "                {%- endif -%}\n"
-        "                {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n"
-        "            {%- endif -%}\n"
-        "        {%- endfor -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "\n"
-        "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n"
-        "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n"
-        "{%- if add_generation_prompt -%}\n"
-        "    {{- '<|im_start|>assistant\\n' -}}\n"
-        "{%- endif -%}\n"
-    )
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token
-        kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)")
-
-        return super().__call__(**kwargs)
-
-class Qwen3VLChatHandler(MTMDChatHandler):
-
-    QWEN3_VL_BOS_TOKEN = "<|endoftext|>"
-    QWEN3_VL_PAD_TOKEN = "<|endoftext|>"
-    QWEN3_VL_EOS_TOKEN = "<|im_end|>"
-
-    CHAT_FORMAT = (
-        "{{- '<|im_start|>system\n' -}}"
-        "{%- if messages[0].content is string and messages[0].role == 'system' -%}"
-            "{{- messages[0].content -}}"
-        "{%- elif messages[0].role == 'system' -%}"
-            "{%- if 'text' in messages[0].content -%}"
-                "{{- messages[0].content.text -}}"
-            "{%- else -%}"
-                "{{- 'You are a helpful assistant.' -}}"
-            "{%- endif -%}"
-        "{%- endif -%}"
-        "{%- if tools -%}"
-            "{{- '\n\n' -}}"
-            "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' -}}"
-            "{%- for tool in tools -%}"
-                "{{- '\n' -}}"
-                "{{- tool | tojson -}}"
-            "{%- endfor -%}"
-            "{{- '\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>' -}}"
-        "{%- endif -%}"
-        "{{- '<|im_end|>\n' -}}"
-        "{%- set image_count = namespace(value=0) -%}"
-        #"{%- set video_count = namespace(value=0) -%}"
-        "{%- for message in messages -%}"
-            "{%- if message.role == 'tool' -%}"
-                "{{- '<|im_start|>user\n<tool_response>\n' -}}"
-            "{%- elif message.role != 'system' -%}"
-                "{{- '<|im_start|>' + message.role + '\n' -}}"
-            "{%- endif -%}"
-            "{%- if message.content is string and message.role != 'system' -%}"
-                "{{- message.content -}}"
-            "{%- elif message.role != 'system' -%}"
-                "{%- for content in message.content -%}"
-                    "{%- if 'image_url' in content -%}"
-                        "{%- set image_count.value = image_count.value + 1 -%}"
-                        "{%- if add_vision_id -%}"
-                            "{{- 'Picture ' -}}"
-                            "{{- image_count.value | string -}}"
-                            "{{- ': ' -}}"
-                        "{%- endif -%}"
-                        "{{- '<|vision_start|>' -}}"
-                        "{%- if content.image_url is string -%}"
-                            "{{- content.image_url -}}"
-                        "{%- else -%}"
-                            "{{- content.image_url.url -}}"
-                        "{%- endif -%}"
-                        "{{- '<|vision_end|>' -}}"
-                    "{%- endif -%}"
-                    # Video not supported yet
-                    "{%- if 'text' in content -%}"
-                        "{{- content.text -}}"
-                    "{%- endif -%}"
-                "{%- endfor -%}"
-            "{%- endif -%}"
-            "{%- if message.role == 'assistant' -%}"
-                "{%- if message.tool_calls -%}"
-                    "{%- for tool_call in message.tool_calls -%}"
-                        "{%- if (loop.first and message.content) or (not loop.first) -%}"
-                            "{{- '\n' -}}"
-                        "{%- endif -%}"
-                        "{%- if tool_call.function -%}"
-                            "{%- set tool_call = tool_call.function -%}"
-                        "{%- endif -%}"
-                        "{{- '<tool_call>\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}"
-                        "{%- if tool_call.arguments is string -%}"
-                            "{{- tool_call.arguments -}}"
-                        "{%- else -%}"
-                            "{{- tool_call.arguments | tojson -}}"
-                        "{%- endif -%}"
-                        "{{- '}\n</tool_call>' -}}"
-                    "{%- endfor -%}"
-                "{%- endif -%}"
-            "{%- elif message.role == 'tool' -%}"
-                "{{- '</tool_response>' -}}"
-            "{%- endif -%}"
-            "{%- if message.role != 'system' -%}"
-                "{{- '<|im_end|>\n' -}}"
-            "{%- endif -%}"
-        "{%- endfor -%}"
-        "{%- if add_generation_prompt -%}"
-            "{{- '<|im_start|>assistant\n' -}}"
-            "{%- if force_reasoning -%}"
-                "{{- '<think>\n' -}}"
-            "{%- endif -%}"
-        "{%- endif -%}"
-    )
-
-    def __init__(
-        self,
-        force_reasoning: bool = False,
-        add_vision_id: bool = True,
-        **kwargs,
-    ):
-        """
-        Parameters:
-        - force_reasoning (bool):
-            - True: Force the reasoning in the model by adding <think> to the chat template.
-            - False (default): Don't force the reasoning.
-        - add_vision_id (bool):
-            - True (default): Count all the images. Recommended for multi-image.
-            - False: Doesn't count the images. Can save tokens with single-image.
-        """
-        super().__init__(**kwargs)
-        self.force_reasoning = force_reasoning
-        self.extra_template_arguments["force_reasoning"] = force_reasoning
-        self.extra_template_arguments["add_vision_id"] = add_vision_id
-
-    def __call__(self, **kwargs):
-        kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN]
-
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing")
-
-        # Use parent implementation
-        return super().__call__(**kwargs)
-
-class Qwen35ChatHandler(MTMDChatHandler):
-    """
-    Handler for Qwen3.5/Qwen3.6 models.
-    """
-    CHAT_FORMAT = (
-        "{%- set image_count = namespace(value=0) -%}"
-        "{%- set video_count = namespace(value=0) -%}"
-        "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}"
-        "    {%- if content is string -%}"
-        "        {{- content -}}"
-        "    {%- elif content is iterable and content is not mapping -%}"
-        "        {%- for item in content -%}"
-        "            {%- if 'image_url' in item or item.type == 'image_url' -%}"
-        "                {%- if is_system_content -%}"
-        "                    {{- raise_exception('System message cannot contain images.') -}}"
-        "                {%- endif -%}"
-        "                {%- if do_vision_count -%}"
-        "                    {%- set image_count.value = image_count.value + 1 -%}"
-        "                {%- endif -%}"
-        "                {%- if add_vision_id -%}"
-        "                    {{- 'Picture ' -}}"
-        "                    {{- image_count.value | string -}}"
-        "                    {{- ': ' -}}"
-        "                {%- endif -%}"
-        "                {{- '<|vision_start|>' -}}"
-        "                {%- if item.image_url is string -%}"
-        "                    {{- item.image_url -}}"
-        "                {%- else -%}"
-        "                    {{- item.image_url.url -}}"
-        "                {%- endif -%}"
-        "                {{- '<|vision_end|>' -}}"
-        "            {%- elif 'video' in item -%}"
-        "                {{- raise_exception('llama.cpp does not currently support video.') -}}"  # Video not supported, raise exception
-        "                {%- if is_system_content -%}"
-        "                    {{- raise_exception('System message cannot contain videos.') -}}"
-        "                {%- endif -%}"
-        "                {%- if do_vision_count -%}"
-        "                    {%- set video_count.value = video_count.value + 1 -%}"
-        "                {%- endif -%}"
-        "                {%- if add_vision_id -%}"
-        "                    {{- 'Video ' ~ video_count.value ~ ': ' -}}"
-        "                {%- endif -%}"
-        "                {{- '<|vision_start|>' -}}"
-        "                {{- item.video -}}"
-        "                {{- '<|vision_end|>' -}}"
-        "            {%- elif 'text' in item -%}"
-        "                {{- item.text -}}"
-        "            {%- else -%}"
-        "                {{- raise_exception('Unexpected item type in content.') -}}"
-        "            {%- endif -%}"
-        "        {%- endfor -%}"
-        "    {%- elif content is none or content is undefined -%}"
-        "        {{- '' -}}"
-        "    {%- else -%}"
-        "        {{- raise_exception('Unexpected content type.') -}}"
-        "    {%- endif -%}"
-        "{%- endmacro -%}"
-        "{%- if not messages -%}"
-        "    {{- raise_exception('No messages provided.') -}}"
-        "{%- endif -%}"
-        "{%- if tools and tools is iterable and tools is not mapping -%}"
-        "    {{- '<|im_start|>system\n' -}}"
-        "    {{- '# Tools\n\nYou have access to the following functions:\n\n<tools>' -}}"
-        "    {%- for tool in tools -%}"
-        "        {{- '\n' -}}"
-        "        {{- tool | tojson -}}"
-        "    {%- endfor -%}"
-        "    {{- '\n</tools>' -}}"
-        "    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' -}}"
-        "    {%- if messages[0].role == 'system' -%}"
-        "        {%- set content = render_content(messages[0].content, false, true) | trim -%}"
-        "        {%- if content -%}"
-        "            {{- '\n\n' + content -}}"
-        "        {%- endif -%}"
-        "    {%- endif -%}"
-        "    {{- '<|im_end|>\n' -}}"
-        "{%- elif messages[0].role == 'system' -%}"
-        "    {%- set content = render_content(messages[0].content, false, true) -%}"
-        "    {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}"
-        "{%- endif -%}"
-        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}"
-        "{%- for message in messages[::-1] -%}"
-        "    {%- set index = messages | length - 1 - loop.index0 -%}"
-        "    {%- if ns.multi_step_tool and message.role == 'user' -%}"
-        "        {%- set content = render_content(message.content, false) | trim -%}"
-        "        {%- if not (content.startswith('<tool_response>') and content.endswith('</tool_response>')) -%}"
-        "            {%- set ns.multi_step_tool = false -%}"
-        "            {%- set ns.last_query_index = index -%}"
-        "        {%- endif -%}"
-        "    {%- endif -%}"
-        "{%- endfor -%}"
-        "{%- if ns.multi_step_tool -%}"
-        "    {{- raise_exception('No user query found in messages.') -}}"
-        "{%- endif -%}"
-        "{%- for message in messages -%}"
-        "    {%- set content = render_content(message.content, true) | trim -%}"
-        "    {%- if message.role == 'system' -%}"
-        "        {%- if not loop.first -%}"
-        "            {{- raise_exception('System message must be at the beginning.') -}}"
-        "        {%- endif -%}"
-        "    {%- elif message.role == 'user' -%}"
-        "        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}"
-        "    {%- elif message.role == 'assistant' -%}"
-        "        {%- set reasoning_content = '' -%}"
-        "        {%- if message.reasoning_content is string -%}"
-        "            {%- set reasoning_content = message.reasoning_content -%}"
-        "        {%- elif '</think>' in content -%}"
-        "            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') -%}"
-        "            {%- set content = content.split('</think>')[-1].lstrip('\n') -%}"
-        "        {%- endif -%}"
-        "        {%- set reasoning_content = reasoning_content | trim -%}"
-        "        {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}"
-        "            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content -}}"
-        "        {%- else -%}"
-        "            {{- '<|im_start|>' + message.role + '\n' + content -}}"
-        "        {%- endif -%}"
-        "        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}"
-        "            {%- for tool_call in message.tool_calls -%}"
-        "                {%- if tool_call.function is defined -%}"
-        "                    {%- set tool_call = tool_call.function -%}"
-        "                {%- endif -%}"
-        "                {%- if loop.first -%}"
-        "                    {%- if content | trim -%}"
-        "                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
-        "                    {%- else -%}"
-        "                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
-        "                    {%- endif -%}"
-        "                {%- else -%}"
-        "                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
-        "                {%- endif -%}"
-        "                {%- if tool_call.arguments is defined -%}"
-        "                    {%- for (args_name, args_value) in tool_call.arguments | items -%}"
-        "                        {{- '<parameter=' + args_name + '>\n' -}}"
-        "                        {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}"
-        "                        {{- args_value -}}"
-        "                        {{- '\n</parameter>' -}}"
-        "                    {%- endfor -%}"
-        "                {%- endif -%}"
-        "                {{- '</function>\n</tool_call>' -}}"
-        "            {%- endfor -%}"
-        "        {%- endif -%}"
-        "        {{- '<|im_end|>\n' -}}"
-        "    {%- elif message.role == 'tool' -%}"
-        "        {%- if loop.previtem and loop.previtem.role != 'tool' -%}"
-        "            {{- '<|im_start|>user' -}}"
-        "        {%- endif -%}"
-        "        {{- '\n<tool_response>\n' -}}"
-        "        {{- content -}}"
-        "        {{- '\n</tool_response>' -}}"
-        "        {%- if not loop.last and loop.nextitem.role != 'tool' -%}"
-        "            {{- '<|im_end|>\n' -}}"
-        "        {%- elif loop.last -%}"
-        "            {{- '<|im_end|>\n' -}}"
-        "        {%- endif -%}"
-        "    {%- else -%}"
-        "        {{- raise_exception('Unexpected message role.') -}}"
-        "    {%- endif -%}"
-        "{%- endfor -%}"
-        "{%- if add_generation_prompt -%}"
-        "    {{- '<|im_start|>assistant\n' -}}"
-        "    {%- if enable_thinking is defined and enable_thinking is false -%}"
-        "        {{- '<think>\n\n</think>\n\n' -}}"
-        "    {%- else -%}"
-        "        {{- '<think>\n' -}}"
-        "    {%- endif -%}"
-        "{%- endif -%}"
-    )
-
-    def __init__(
-        self,
-        add_vision_id: bool = True,
-        enable_thinking: bool = True,
-        preserve_thinking: bool = False,
-        **kwargs,
-    ):
-        """
-        Parameters:
-        - add_vision_id (bool):
-            - True (default): Count all the images. Recommended for multi-image.
-            - False: Doesn't count the images. Can save tokens with single-image.
-        - enable_thinking (bool):
-            - True (default): Enables reasoning for better results.
-            - False: Disables reasoning for faster results.
-        - preserve_thinking (bool):
-            - True: Keeps <think> reasoning process for ALL historical conversational turns.
-            - False (default): Only keeps <think> for the latest assistant reply to save tokens.
-        """
-        super().__init__(**kwargs)
-        self.enable_thinking = enable_thinking
-        self.preserve_thinking = preserve_thinking
-        self.extra_template_arguments["add_vision_id"] = add_vision_id
-        self.extra_template_arguments["enable_thinking"] = enable_thinking
-        self.extra_template_arguments["preserve_thinking"] = preserve_thinking
-
-    def __call__(self, **kwargs):
-        llama = kwargs['llama']
-
-        if hasattr(llama, 'input_ids'):
-            llama.input_ids.fill(0)
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing")
-
-        # Use parent implementation
-        return super().__call__(**kwargs)
-
-
-class Step3VLChatHandler(MTMDChatHandler):
-    """
-    Handler for Step3-VL models.
-    """
-
-    STEP3VL_BOS_TOKEN = "<|im_start|>"
-    STEP3VL_EOS_TOKEN = "<|im_end|>"
-    STEP3VL_PAD_TOKEN = "<|endoftext|>"
-    STEP3VL_IMAGE_TOKEN = "<im_patch>"
-
-    CHAT_FORMAT = (
-        "{%- macro render_content(content) -%}\n"
-        "    {%- if content is none -%}{{- '' -}}\n"
-        "    {%- elif content is string -%}{{- content -}}\n"
-        "    {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n"
-        "    {%- elif content is iterable -%}\n"
-        "        {%- for item in content -%}\n"
-        "            {%- if item.type == 'text' -%}\n"
-        "                {{- item['value'] if 'value' in item else item['text'] -}}\n"
-        "            {%- elif item.type in ['image', 'image_url'] -%}\n"
-        "                {%- set url_val = '' -%}\n"
-        "                {%- if item.image_url -%}\n"
-        "                    {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n"
-        "                {%- endif -%}\n"
-        "                {{- '<im_patch>' + url_val -}}\n"
-        "            {%- endif -%}\n"
-        "        {%- endfor -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endmacro -%}\n"
-        "\n"
-        "{%- if tools -%}\n"
-        "    {{- '<|im_start|>system\\n' -}}\n"
-        "    {%- if messages[0].role == 'system' -%}\n"
-        "        {{- render_content(messages[0].content) + '\\n\\n' -}}\n"
-        "    {%- endif -%}\n"
-        "    {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>' -}}\n"
-        "    {%- for tool in tools -%}\n"
-        "        {{- '\\n' -}}\n"
-        "        {{- tool | tojson -}}\n"
-        "    {%- endfor -%}\n"
-        "    {{- '\\n</tools>\\n\\nAlways adhere to this exact format for tool use:\\n<tool_calls>\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>\\n{additional_tool_calls}</tool_calls>\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags.\\n- `<function-name>` must be an exact match to one of the available tools.\\n- `<args-json-object>` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n"
-        "{%- else -%}\n"
-        "    {%- if messages[0].role == 'system' -%}\n"
-        "        {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n"
-        "    {%- endif -%}\n"
-        "{%- endif -%}\n"
-        "\n"
-        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n"
-        "{%- for message in messages[::-1] -%}\n"
-        "    {%- set index = (messages|length - 1) - loop.index0 -%}\n"
-        "    {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) -%}\n"
-        "        {%- set ns.multi_step_tool = false -%}\n"
-        "        {%- set ns.last_query_index = index -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "\n"
-        "{%- for message in messages -%}\n"
-        "    {%- set content = render_content(message.content) -%}\n"
-        "    {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n"
-        "        {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n"
-        "        {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n"
-        "    {%- elif message.role == 'assistant' -%}\n"
-        "        {%- if message.reasoning_content is string -%}\n"
-        "            {%- set reasoning_content = render_content(message.reasoning_content) -%}\n"
-        "        {%- else -%}\n"
-        "            {%- if '</think>' in content -%}\n"
-        "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') -%}\n"
-        "                {%- set content = content.split('</think>')[-1].lstrip('\\n') -%}\n"
-        "            {%- else -%}\n"
-        "                {%- set reasoning_content = '' -%}\n"
-        "            {%- endif -%}\n"
-        "        {%- endif -%}\n"
-        "        {%- if loop.index0 > ns.last_query_index -%}\n"
-        "            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n' + content -}}\n"
-        "        {%- else -%}\n"
-        "            {{- '<|im_start|>' + message.role + '\\n' + content -}}\n"
-        "        {%- endif -%}\n"
-        "        {%- if message.tool_calls -%}\n"
-        "            {{- '\\n<tool_calls>' -}}\n"
-        "            {%- for tool_call in message.tool_calls -%}\n"
-        "                {{- '\\n' -}}\n"
-        "                {%- if tool_call.function -%}\n"
-        "                    {%- set tool_call = tool_call.function -%}\n"
-        "                {%- endif -%}\n"
-        "                {{- '<tool_call>\\n{\"name\": \"' -}}\n"
-        "                {{- tool_call.name -}}\n"
-        "                {{- '\", \"arguments\": ' -}}\n"
-        "                {%- if tool_call.arguments is string -%}\n"
-        "                    {{- tool_call.arguments -}}\n"
-        "                {%- else -%}\n"
-        "                    {{- tool_call.arguments | tojson -}}\n"
-        "                {%- endif -%}\n"
-        "                {{- '}\\n</tool_call>' -}}\n"
-        "            {%- endfor -%}\n"
-        "            {{- '\\n</tool_calls>' -}}\n"
-        "        {%- endif -%}\n"
-        "        {{- '<|im_end|>\\n' -}}\n"
-        "    {%- elif message.role == 'tool' -%}\n"
-        "        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n"
-        "            {{- '<|im_start|>tool_response' -}}\n"
-        "        {%- endif -%}\n"
-        "        {{- '\\n<tool_response>\\n' -}}\n"
-        "        {{- content -}}\n"
-        "        {{- '\\n</tool_response>' -}}\n"
-        "        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n"
-        "            {{- '<|im_end|>\\n' -}}\n"
-        "        {%- endif -%}\n"
-        "    {%- endif -%}\n"
-        "{%- endfor -%}\n"
-        "{%- if add_generation_prompt -%}\n"
-        "    {{- '<|im_start|>assistant\\n<think>\\n\\n</think>\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n<think>' -}}\n"
-        "{%- endif -%}\n"
-    )
-
-    def __init__(self, enable_thinking: bool = True, **kwargs):
-        """
-        Initializes the Step3-VL Handler.
-
-        Args:
-            enable_thinking (bool): If False, injects an empty <think> block to bypass reasoning.
-        """
-        self.enable_thinking = enable_thinking
-        super().__init__(**kwargs)
-
-    def __call__(self, **kwargs):
-        # Pass thinking toggle into Jinja
-        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
-
-        # Step3 uses standard <|im_end|> ChatML stop formatting
-        kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN]
-
-        if self.verbose:
-            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
-
-        return super().__call__(**kwargs)
-
-
-@register_chat_completion_handler("chatml-function-calling")
-def chatml_function_calling(
-    llama: llama_core.Llama,
-    messages: List[llama_types.ChatCompletionRequestMessage],
-    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
-    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
-    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
-    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
-    temperature: float = 0.2,
-    top_p: float = 0.95,
-    top_k: int = 40,
-    min_p: float = 0.05,
-    typical_p: float = 1.0,
-    stream: bool = False,
-    stop: Optional[Union[str, List[str]]] = [],
-    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
-    max_tokens: Optional[int] = None,
-    present_penalty: float = 0.0,
-    frequency_penalty: float = 0.0,
-    repeat_penalty: float = 1.1,
-    top_n_sigma: float = -1.00,
-    mirostat_mode: int = 0,
-    mirostat_tau: float = 5.0,
-    mirostat_eta: float = 0.1,
-    xtc_threshold: float = 0.1,
-    xtc_probability: float = 0.0,
-    dry_multiplier: float = 0.0,
-    dry_base: float = 1.75,
-    dry_allowed_length: int = 2,
-    dry_penalty_last_n:int = 0,
-    dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"],
-    adaptive_target : float = -1.0,
-    adaptive_decay : float = 0.9,
-    use_infill: bool = False,
-    model: Optional[str] = None,
-    logits_processor: Optional[llama_core.LogitsProcessorList] = None,
-    grammar: Optional[llama_grammar.LlamaGrammar] = None,
-    logprobs: Optional[bool] = None,
-    top_logprobs: Optional[int] = None,
-    **kwargs,  # type: ignore
-) -> Union[
-    llama_types.CreateChatCompletionResponse,
-    Iterator[llama_types.CreateChatCompletionStreamResponse],
-]:
-    function_calling_template = (
-        "{% for message in messages %}"
-        "<|im_start|>{{ message.role }}\n"
-        # System message
-        "{% if message.role == 'system' %}"
-        "{{ message.content }}"
-        "{% if tool_calls %}"
-        "\n\nYou have access to the following functions:\n"
-        "{% for tool in tools %}"
-        "\nfunctions.{{ tool.function.name }}:\n"
-        "{{ tool.function.parameters | tojson }}"
-        "\n{% endfor %}"
-        "\n\nYou can respond to users messages with either a single message or one or more function calls."
-        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
-        "\n\nmessage:"
-        "\n<message>"
-        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
-        "\n\nfunctions.<function_name>:"
-        '\n{ "arg1": "value1", "arg2": "value2" }'
-        "\nfunctions.<function_name>:"
-        '\n{ "arg1": "value1", "arg2": "value2" }'
-        "{% endif %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        # User message
-        "{% if message.role == 'user' %}"
-        "{{ message.content }}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        # Assistant message
-        "{% if message.role == 'assistant' %}"
-        ## Reglar message
-        "{% if message.content and message.content | length > 0 %}"
-        "{% if tool_calls %}"
-        "message:\n"
-        "{% endif %}"
-        "{{ message.content }}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        ## Function calls
-        "{% if 'tool_calls' in message %}"
-        "{% for tool_call in message.tool_calls %}"
-        "functions.{{ tool_call.function.name }}:\n"
-        "{{ tool_call.function.arguments }}"
-        "{% endfor %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-    )
-    template_renderer = ImmutableSandboxedEnvironment(
-        autoescape=jinja2.select_autoescape(["html", "xml"]),
-        undefined=jinja2.StrictUndefined,
-    ).from_string(function_calling_template)
-
-    # Convert legacy functions to tools
-    if functions is not None:
-        tools = [
-            {
-                "type": "function",
-                "function": function,
-            }
-            for function in functions
-        ]
-
-    # Convert legacy function_call to tool_choice
-    if function_call is not None:
-        if isinstance(function_call, str) and (
-            function_call == "none" or function_call == "auto"
-        ):
-            tool_choice = function_call
-        if isinstance(function_call, dict) and "name" in function_call:
-            tool_choice = {
-                "type": "function",
-                "function": {
-                    "name": function_call["name"],
-                },
-            }
-
-    stop = (
-        [stop, "<|im_end|>"]
-        if isinstance(stop, str)
-        else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
-    )
-
-    # Case 1: No tool choice by user
-    if (
-        tool_choice is None
-        or (isinstance(tool_choice, str) and tool_choice == "none")
-        or tools is None
-        or len(tools) == 0
-    ):
-        prompt = template_renderer.render(
-            messages=messages,
-            tools=[],
-            tool_calls=None,
-            add_generation_prompt=True,
-        )
-
-        if response_format is not None and response_format["type"] == "json_object":
-            grammar = _grammar_for_response_format(response_format)
-
-        return _convert_completion_to_chat(
-            llama.create_completion(
-                prompt=prompt,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                min_p=min_p,
-                typical_p=typical_p,
-                stream=stream,
-                stop=stop,
-                max_tokens=max_tokens,
-                present_penalty=present_penalty,
-                frequency_penalty=frequency_penalty,
-                repeat_penalty=repeat_penalty,
-                top_n_sigma=top_n_sigma,
-                mirostat_mode=mirostat_mode,
-                mirostat_tau=mirostat_tau,
-                mirostat_eta=mirostat_eta,
-                xtc_threshold=xtc_threshold,
-                xtc_probability=xtc_probability,
-                dry_multiplier=dry_multiplier,
-                dry_base=dry_base,
-                dry_allowed_length=dry_allowed_length,
-                dry_penalty_last_n=dry_penalty_last_n,
-                dry_seq_breakers=dry_seq_breakers,
-                adaptive_target=adaptive_target,
-                adaptive_decay=adaptive_decay,
-                use_infill=use_infill,
-                model=model,
-                logits_processor=logits_processor,
-                grammar=grammar,
-                logprobs=top_logprobs if logprobs else None,
-            ),
-            stream=stream,
-        )
-
-    # Case 2: Tool choice by user
-    if isinstance(tool_choice, dict):
-        tool_name = tool_choice["function"]["name"]
-        tool = next(
-            (tool for tool in tools if tool["function"]["name"] == tool_name), None
-        )
-        if tool is None:
-            raise ValueError(f"Tool with name '{tool_name}' not found in tools")
-        prompt = template_renderer.render(
-            messages=messages,
-            tools=tools,
-            tool_calls=True,
-            add_generation_prompt=True,
-        )
-        prompt += f"functions.{tool_name}:\n"
+        prompt += f"functions.{tool_name}:\n"
         try:
             grammar = llama_grammar.LlamaGrammar.from_json_schema(
                 json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
@@ -6956,3 +3539,35 @@ def chatml_function_calling(
         }
 
     raise ValueError("Automatic streaming tool choice is not supported")
+
+# Backward compatibility re-exports.
+# These multimodal chat handlers have been moved to `llama_multimodal`.
+# New code should import them from `llama_cpp.llama_multimodal` instead of
+# `llama_cpp.llama_chat_format`.
+from llama_cpp.llama_multimodal import (
+    MTMDChatHandler,
+    GenericMTMDChatHandler,
+    Llava15ChatHandler,
+    ObsidianChatHandler,
+    MoondreamChatHandler,
+    Llava16ChatHandler,
+    NanoLlavaChatHandler,
+    Llama3VisionAlphaChatHandler,
+    Llama3VisionAlpha,
+    MiniCPMv26ChatHandler,
+    MiniCPMv45ChatHandler,
+    MiniCPMV46ChatHandler,
+    Gemma3ChatHandler,
+    Gemma4ChatHandler,
+    GLM41VChatHandler,
+    GLM46VChatHandler,
+    GraniteDoclingChatHandler,
+    LFM2VLChatHandler,
+    LFM25VLChatHandler,
+    PaddleOCRChatHandler,
+    Qwen25VLChatHandler,
+    Qwen3ASRChatHandler,
+    Qwen3VLChatHandler,
+    Qwen35ChatHandler,
+    Step3VLChatHandler
+)
diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py
new file mode 100644
index 0000000000..a055869543
--- /dev/null
+++ b/llama_cpp/llama_multimodal.py
@@ -0,0 +1,3473 @@
+from __future__ import annotations
+
+import base64
+import ctypes
+import json
+import os
+import sys
+import zlib
+
+from contextlib import ExitStack
+from typing import (
+    Any,
+    Dict,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    Protocol,
+    TYPE_CHECKING,
+    cast,
+)
+
+import urllib.request
+from urllib.error import URLError, HTTPError
+
+import llama_cpp.llama_cpp as llama_cpp_lib
+import llama_cpp.llama_types as llama_types
+import llama_cpp.llama_grammar as llama_grammar
+
+if TYPE_CHECKING:
+    import llama_cpp.llama as llama_core
+
+from ._logger import ggml_log_callback
+
+from llama_cpp.llama_chat_format import (
+    _convert_completion_to_chat,
+    _convert_completion_to_chat_function,
+    _grammar_for_response_format,
+    ImmutableSandboxedEnvironment
+)
+
+class MTMDChatHandler:
+    DEFAULT_SYSTEM_MESSAGE: Optional[str] = (
+"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, "
+"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful."
+    )
+
+    CHAT_FORMAT = (  # Default Jinja chat template; __init__ falls back to it when no override or pre-set chat_format exists
+        "{{ bos_token if bos_token is defined else '' }}"
+        "{% for message in messages %}"
+            "{% if message.role == 'system' %}"
+                "{{ message.content }}"
+            "{% elif message.role == 'user' %}"
+                "USER: "
+                "{% if message.content is string %}"
+                    "{{ message.content }}"
+                "{% elif message.content is iterable %}"
+                    "{% for content in message.content %}"
+                        "{% if content.type == 'image_url' %}"
+                            "{{ content.image_url if content.image_url is string else content.image_url.url }}"  # media parts render their URL (bare string or object with .url) straight into the prompt text
+                        "{% elif content.type == 'audio_url' %}"
+                            "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}"
+                        "{% elif content.type == 'input_audio' %}"
+                            "{% if content.input_audio is string %}"
+                                "{{ content.input_audio }}"
+                            "{% else %}"
+                                "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}"  # structured inline audio is rendered as a base64 data URI
+                            "{% endif %}"
+                        "{% elif content.type == 'video_url' %}"
+                            "{{ content.video_url if content.video_url is string else content.video_url.url }}"
+                        "{% elif content.type == 'text' %}"
+                            "{{ content.text }}"
+                        "{% endif %}"
+                    "{% endfor %}"
+                "{% endif %}"
+
+            "{% elif message.role == 'assistant' and message.content is not none %}"  # assistant turns with null content emit only the newline below
+                "ASSISTANT: {{ message.content }}"
+            "{% endif %}"
+            "{{ \"\n\" }}"
+        "{% endfor %}"
+
+        "{% if eos_token is defined %}"  # eos_token, when supplied, is emitted once after the whole history and before the generation prompt
+            "{{ eos_token }}"
+        "{% endif %}"
+
+        "{% if add_generation_prompt %}"
+            "ASSISTANT: "
+        "{% endif %}"
+    )
+
+    def __init__(
+        self,
+        mmproj_path: Optional[str] = None,
+        verbose: bool = True,
+        use_gpu: bool = True,
+        image_min_tokens: int = -1,
+        image_max_tokens: int = -1,
+        chat_template_override: Optional[str] = None,
+        batch_max_tokens: int = 1024,
+        **kwargs
+    ):
+        """Validate the mmproj path, store the settings and compile the chat template (the mtmd context stays None until `_init_mtmd_context` runs); raises TypeError on unexpected keyword arguments and ValueError when the path is missing or does not exist."""
+        self.log_prefix = self.__class__.__name__  # concrete (sub)class name, used as the prefix of log and error messages
+        self.verbose = verbose
+
+        # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`.
+        # Accept it for existing user code, warn during initialization, and normalize
+        # all internal usage to `mmproj_path`.
+        clip_model_path = kwargs.pop("clip_model_path", None)
+        if mmproj_path is None and clip_model_path is not None:  # an explicit `mmproj_path` wins; the alias is then ignored without a warning
+            mmproj_path = clip_model_path
+            if self.verbose:
+                print(
+                    f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; "
+                    "please use `mmproj_path` instead.",
+                    file=sys.stderr,
+                )
+
+        if kwargs:  # anything still left after popping `clip_model_path` is not a supported option
+            unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys())
+            raise TypeError(
+                f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n"
+                f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}."
+            )
+
+        if mmproj_path is None:
+            raise ValueError(
+                f"{self.log_prefix}(__init__): `mmproj_path` is required. "
+                "`clip_model_path` is accepted only as a deprecated compatibility alias."
+            )
+
+        self.mmproj_path = mmproj_path
+        if not os.path.exists(self.mmproj_path):
+            raise ValueError(
+                f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}"
+            )
+
+        self.image_min_tokens = image_min_tokens
+        self.image_max_tokens = image_max_tokens
+        self.batch_max_tokens = batch_max_tokens
+        self.use_gpu = use_gpu
+
+        import llama_cpp.mtmd_cpp as mtmd_cpp  # function-scope import; the module is kept on the instance for later native calls
+        self._mtmd_cpp = mtmd_cpp
+        self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None  # created later by _init_mtmd_context
+        self.extra_template_arguments: dict[str, Any] = {}
+
+        self.is_support_vision = False  # capability flags start False and are overwritten by _init_mtmd_context
+        self.is_support_audio = False
+        self.is_support_video = False
+
+        # Pre-compile Jinja template: an explicit override wins; otherwise keep a chat_format already set on the instance, else fall back to CHAT_FORMAT
+        if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None:
+            self.chat_format = self.CHAT_FORMAT
+        elif chat_template_override is not None:
+            self.chat_format = chat_template_override
+
+        self._chat_format_parser_tags = []
+        self._change_chat_template(self.chat_format)
+
+        self._exit_stack = ExitStack()  # closed and cleared by close()
+
+    def _change_chat_template(self, new_template: str):
+        """Compile `new_template` in a sandboxed Jinja environment and store the result as `self.chat_template`."""
+        jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
+        compiled_template = jinja_env.from_string(new_template)
+        self.chat_template = compiled_template
+
+    def _init_mtmd_context(self, llama_model: llama_core.Llama):
+        """Create the mtmd context for `llama_model` on first use (later calls return immediately) and record vision/audio/video support; raises ValueError on inverted image-token bounds or when the mmproj file fails to load."""
+        if self.mtmd_ctx is not None:
+            return  # Already initialized
+
+        self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0))  # installs ggml_log_callback as the mtmd log callback, with a null user-data pointer
+
+        # Get default parameters
+        self.mctx_params = self._mtmd_cpp.mtmd_context_params_default()
+        self.mctx_params.use_gpu = self.use_gpu
+        self.mctx_params.print_timings = self.verbose
+        self.mctx_params.n_threads = llama_model.n_threads
+        self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
+        self.mctx_params.warmup = True
+        if self.image_min_tokens > 0:  # non-positive values (the default is -1) leave the library default in place
+            self.mctx_params.image_min_tokens = self.image_min_tokens
+        if self.image_max_tokens > 0:
+            self.mctx_params.image_max_tokens = self.image_max_tokens
+        if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0:  # bounds are only compared when a max was explicitly requested
+            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) "
+                                f"cannot be less than image_min_tokens ({self.image_min_tokens}).")
+        self.mctx_params.batch_max_tokens = self.batch_max_tokens
+
+        # Cache the model's eos token and bos token as text; bytes that are not valid UTF-8 are dropped
+        self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore')
+        self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore')
+
+        # Cache the mtmd_default_marker
+        self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
+
+        # Initialize mtmd context
+        self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
+            self.mmproj_path.encode(),
+            llama_model.model,
+            self.mctx_params
+        )
+
+        if self.mtmd_ctx is None:
+            raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}")
+
+        # Check if vision is supported
+        self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx)
+        if self.is_support_vision:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr)
+        else:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr)
+
+        # Check if audio is supported
+        self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx)
+        if self.is_support_audio:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr)
+        else:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr)
+
+        # Check if video is supported (queried through the helper API, unlike vision/audio above)
+        self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx)
+        if self.is_support_video:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr)
+        else:
+            if self.verbose:
+                print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr)
+
+    def close(self) -> None:
+        """
+        Release the native mtmd context and close the exit stack, if present.
+
+        Every step is skipped when the attribute it works on is missing or
+        already None, so calling this repeatedly is harmless.
+        """
+        native_ctx = getattr(self, "mtmd_ctx", None)
+        if native_ctx is not None:
+            try:
+                self._mtmd_cpp.mtmd_free(native_ctx)
+            except Exception:
+                # Best-effort teardown: a failing free must not block the reset below.
+                pass
+            self.mtmd_ctx = None
+            self.mctx_params = None
+            self.chat_template = None
+
+        exit_stack = getattr(self, "_exit_stack", None)
+        if exit_stack is not None and hasattr(exit_stack, "close"):
+            exit_stack.close()
+            self._exit_stack = None
+
+    def __del__(self) -> None:
+        """Finalizer: delegate to close() so the resources it manages are released when the handler is collected."""
+        self.close()
+
+    def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]:
+        """
+        Collect every media payload (image, audio, video) referenced by `messages`.
+
+        Only messages whose "content" is a list are scanned. Items are returned in
+        order of appearance. Recognised content-part types:
+
+          - "image_url": a string, or an object holding the string under "url".
+          - "audio" / "audio_url": same shape, stored under the key named by the type.
+          - "input_audio": an object with "data" (base64) and "format" ("wav" or "mp3"),
+            re-packed as a `data:audio/<format>;base64,<data>` URI; a bare non-empty
+            string is used as-is; any other shape is ignored.
+          - "video_url": a string, or an object holding the string under "url".
+          - "text" is skipped; any other type is ignored (logged when verbose).
+
+        Args:
+            messages: Chat messages to scan.
+
+        Returns:
+            media_items: A list of {"url": ..., "type": "image" | "audio" | "video"} dicts.
+
+        Raises:
+            ValueError: If a part uses a modality this handler reports as unsupported,
+                if `input_audio.format` is not "wav" or "mp3", or if a media URL is empty.
+        """
+        def _resolve_url(part, key: str) -> str:
+            # A media part carries either a bare string or an object with the string under "url".
+            payload = part[key]
+            url = payload if isinstance(payload, str) else payload["url"]
+            if not url:
+                # An empty URL can never be loaded, and when later used as a str.replace()
+                # needle it would scatter media markers across the whole rendered prompt.
+                raise ValueError(f"{self.log_prefix}: Empty URL in '{key}' content part.")
+            return url
+
+        media_items: List[Dict[str, str]] = []
+        for message in messages:
+            parts = message.get("content")
+            if not isinstance(parts, list):
+                continue
+
+            for content in parts:
+                content_type = content.get("type", "")
+
+                # 1. Vision Processing
+                if content_type == "image_url":
+                    if not self.is_support_vision:
+                        raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.")
+
+                    media_items.append({"url": _resolve_url(content, "image_url"), "type": "image"})
+
+                # 2a. Audio given as a URL (custom / forward-compatible formats)
+                elif content_type in ("audio", "audio_url"):
+                    if not self.is_support_audio:
+                        raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")
+
+                    media_items.append({"url": _resolve_url(content, content_type), "type": "audio"})
+
+                # 2b. Audio given in the OpenAI `input_audio` format
+                elif content_type == "input_audio":
+                    if not self.is_support_audio:
+                        raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")
+
+                    input_audio = content.get("input_audio", {})
+                    if isinstance(input_audio, dict) and "data" in input_audio:
+                        audio_data = input_audio.get("data", "")
+                        audio_format = input_audio.get("format", "")
+
+                        # Strictly align with llama.cpp (require wav/mp3)
+                        if audio_format not in ("wav", "mp3"):
+                            raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'")
+
+                        # Format as a Data URI so the payload goes through the same loader as URLs
+                        media_items.append({
+                            "url": f"data:audio/{audio_format};base64,{audio_data}",
+                            "type": "audio",
+                        })
+                    elif isinstance(input_audio, str) and input_audio:
+                        # Bare string payload: pass it through unchanged
+                        media_items.append({"url": input_audio, "type": "audio"})
+
+                # 3. Video Processing
+                elif content_type == "video_url":
+                    if not self.is_support_video:
+                        raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.")
+
+                    media_items.append({"url": _resolve_url(content, "video_url"), "type": "video"})
+
+                # 4. Text carries no media; anything else is unknown and ignored
+                elif content_type != "text" and self.verbose:
+                    print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr)
+
+        return media_items
+
+    def _create_bitmap_from_bytes(self, media_bytes: bytes):
+        """
+        Build an mtmd bitmap from an in-memory media file.
+
+        The bytes are copied into a ctypes uint8 array and handed to
+        `mtmd_helper_bitmap_init_from_buf` together with the mtmd context.
+        Which media formats decode successfully is decided entirely by that
+        native helper.
+
+        Args:
+            media_bytes (bytes): Raw content of the media file; must be non-empty.
+
+        Returns:
+            A `(bitmap, video_ctx)` pair taken from the helper's result. `video_ctx`
+            may be falsy (NULL) when the helper produced no video helper context.
+
+        Raises:
+            ValueError: If the mtmd context is not initialized, `media_bytes` is empty,
+                or the helper returns no bitmap.
+        """
+        if self.mtmd_ctx is None:
+            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.")
+
+        if not media_bytes:
+            raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.")
+
+        n_bytes = len(media_bytes)
+        # from_buffer_copy gives the native side its own copy of the payload
+        native_buf = (ctypes.c_uint8 * n_bytes).from_buffer_copy(media_bytes)
+
+        loaded = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(self.mtmd_ctx, native_buf, n_bytes, False)
+
+        if loaded.bitmap:
+            return loaded.bitmap, loaded.video_ctx
+
+        # Decoding failed: release a video helper context that may still have been created
+        if loaded.video_ctx:
+            self._mtmd_cpp.mtmd_helper_video_free(loaded.video_ctx)
+
+        raise ValueError(
+            f"{self.log_prefix}(_create_bitmap_from_bytes): "
+            "Failed to load media from bytes "
+            "(unsupported media format, corrupted data, or missing helper support)."
+        )
+
+    def _is_text_chunk(self, chunk_type: int) -> bool:
+        """Tell whether `chunk_type` equals the MTMD_INPUT_CHUNK_TYPE_TEXT enum value."""
+        text_kind = self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT
+        return chunk_type == text_kind
+
+    def _is_image_chunk(self, chunk_type: int) -> bool:
+        """Tell whether `chunk_type` equals the MTMD_INPUT_CHUNK_TYPE_IMAGE enum value."""
+        image_kind = self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE
+        return chunk_type == image_kind
+
+    def _is_audio_chunk(self, chunk_type: int) -> bool:
+        """Tell whether `chunk_type` equals the MTMD_INPUT_CHUNK_TYPE_AUDIO enum value."""
+        audio_kind = self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
+        return chunk_type == audio_kind
+
+    def _process_mtmd_prompt(
+        self,
+        llama: llama_core.Llama,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+        add_generation_prompt: bool = True,
+    ) -> Tuple[List[int], List[tuple], Any, List[Any]]:
+        """
+        Core multimodal preprocessing pipeline.
+        Converts raw chat messages into mtmd chunk structures and a virtual token ledger.
+
+        Steps:
+          1. Prepend DEFAULT_SYSTEM_MESSAGE when the first system message is missing or empty.
+          2. Render the chat template, then replace parser tags and media URLs with the
+             mtmd media marker.
+          3. Decode all media items on a thread pool; results are stored by index so the
+             bitmap order matches the order of `media_items`.
+          4. Tokenize text + bitmaps with `mtmd_tokenize`.
+          5. Walk the resulting chunks and build a flat ledger in which every media chunk
+             is represented by a negative pseudo token id derived from the media URL.
+
+        If anything raises, the chunks, bitmaps and video helper contexts created so far
+        are freed before the exception propagates.
+
+        Args:
+            llama: Model wrapper; `n_threads` bounds the decode pool and `n_tokens == 0`
+                decides whether special tokens are added during tokenization.
+            messages: Chat messages to render.
+            functions, function_call, tools, tool_choice: Passed to the chat template unchanged.
+            add_generation_prompt: Passed to the chat template unchanged.
+
+        Returns:
+            full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching.
+            chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id).
+            chunks: Allocated mtmd_input_chunks pointer (must be freed by the caller).
+            bitmap_cleanup: List of bitmap pointers to be freed by the caller after evaluation.
+
+        Raises:
+            ValueError: If chunk initialization or tokenization fails (plus anything raised
+                while collecting or decoding media).
+            RuntimeError: If a media slot is still empty after decoding.
+            TypeError: If a chunk has a type other than text, image or audio.
+        """
+        # 1. Inject default system prompt if omitted by the user
+        system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "")
+        if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
+            messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages
+
+        media_items = self._get_media_items(messages)
+        media_marker = self.media_marker
+
+        # 2. Render the chat template and replace actual URLs with the media marker
+        text = self.chat_template.render(
+            messages=messages,
+            add_generation_prompt=add_generation_prompt,
+            eos_token=self.mtmd_eos_token,
+            bos_token=self.mtmd_bos_token,
+            functions=functions,
+            function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
+            **getattr(self, 'extra_template_arguments', {})
+        )
+
+        # str.replace is a no-op when the tag is absent, so no membership pre-check is needed
+        for tag in self._chat_format_parser_tags:
+            text = text.replace(tag, media_marker)
+
+        # Replace every media URL by media_marker in text
+        for item in media_items:
+            text = text.replace(item["url"], media_marker)
+
+        if self.verbose:
+            print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr)
+            print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr)
+
+        # 3. Pre-allocate the bitmap array so each decode result lands at its input index
+        bitmaps = [None] * len(media_items)
+        bitmap_cleanup = []
+        video_cleanup = []
+        chunks = None
+
+        try:
+            # Concurrent Media Decoding
+            import concurrent.futures
+            if media_items:
+                def _create_bitmap_func(idx: int, item: dict):
+                    media_bytes = self.load_media(item["url"], item["type"])
+                    bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes)
+                    return idx, bitmap, video_ctx
+
+                # ThreadPoolExecutor rejects max_workers <= 0, so always use at least one worker.
+                max_workers = max(1, min(llama.n_threads, len(media_items)))
+                first_error = None
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)]
+
+                    for future in concurrent.futures.as_completed(futures):
+                        try:
+                            idx, bitmap, video_ctx = future.result()
+                        except Exception as exc:
+                            # Keep draining: the executor runs the sibling tasks to completion
+                            # anyway, and their native objects must be registered below so the
+                            # cleanup path can free them.
+                            if first_error is None:
+                                first_error = exc
+                            continue
+
+                        bitmaps[idx] = bitmap
+                        bitmap_cleanup.append(bitmap)
+
+                        if video_ctx:
+                            video_cleanup.append(video_ctx)
+
+                if first_error is not None:
+                    raise first_error
+
+                # Strict validation: Abort if any slot was left without a decoded bitmap
+                if any(b is None for b in bitmaps):
+                    raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.")
+
+                if self.verbose:
+                    print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.", file=sys.stderr)
+
+            # 4. Initialize mtmd_input_chunks
+            input_text = self._mtmd_cpp.mtmd_input_text()
+            input_text.text = text.encode('utf-8')
+            input_text.add_special = (llama.n_tokens == 0)
+            input_text.parse_special = True
+
+            chunks = self._mtmd_cpp.mtmd_input_chunks_init()
+            if chunks is None:
+                raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.")
+
+            # 5. Hybrid Tokenization (Text + Media binding)
+            if len(bitmaps) > 0:
+                bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
+                result = self._mtmd_cpp.mtmd_tokenize(
+                    self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps)
+                )
+            else:
+                result = self._mtmd_cpp.mtmd_tokenize(
+                    self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0
+                )
+
+            if result != 0:
+                raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.")
+
+            # Video helper contexts only need to stay alive until mtmd_tokenize() completes.
+            for video_ctx in video_cleanup:
+                self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
+            video_cleanup.clear()
+
+            # 6. Virtual Token Ledger Construction
+            full_prompt_ids = []
+            chunk_token_spans = []
+            current_idx = 0
+            n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
+
+            # Cursor to track the actual media contents (URLs or base64 data) provided by the user
+            media_items_count = len(media_items)
+            media_items_cur = 0
+            last_media_id = None
+
+            for i in range(n_chunks):
+                chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
+                if chunk is None:
+                    continue
+                chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
+
+                if self._is_text_chunk(chunk_type):
+                    # Extract standard text token IDs
+                    n_tokens_out = ctypes.c_size_t()
+                    tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out))
+                    if tokens_ptr and n_tokens_out.value > 0:
+                        tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
+                        chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None))
+                        full_prompt_ids.extend(tokens)
+                        current_idx += len(tokens)
+                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
+                    # Extract media properties
+                    # Note(JamePeng):
+                    # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models).
+                    # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample.
+                    # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise
+                    chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
+
+                    if media_items_cur < media_items_count:
+                        # The tokenizer only sees identical placeholders, so the identity of the
+                        # actual media content has to be injected here.
+                        real_media_url = media_items[media_items_cur]["url"]
+                        # Derive a deterministic negative pseudo-id from the URL:
+                        # - zlib.crc32 is stable across platforms and runs (unlike Python's hash()).
+                        # - Masking to 24 bits and offsetting by 100 yields ids in
+                        #   [-16,777,315, -100], which cannot collide with non-negative text token ids.
+                        # The ledger is later compared against the token history for prefix matching,
+                        # so the same URL matches itself and a different URL breaks the match.
+                        media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100
+                        last_media_id = media_id
+                        media_items_cur += 1
+                    elif last_media_id is not None:
+                        # video may expand into multiple image chunks from one media marker
+                        # NOTE(review): the cursor advances once per media chunk, so a multi-chunk
+                        # item followed by further items would consume their URLs early -- verify.
+                        media_id = last_media_id
+                    else:
+                        # Fixed negative fallback id when there is no media item to derive one from
+                        media_id = -314159
+
+                    if self.verbose:
+                        print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ", file=sys.stderr)
+
+                    chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))
+
+                    # Pad the ledger with the pseudo-ID, one entry per token the chunk occupies
+                    full_prompt_ids.extend([media_id] * chunk_n_tokens)
+                    current_idx += chunk_n_tokens
+                else:
+                    raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.")
+
+            return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup
+
+        except Exception:
+            # Release every native object created so far, then re-raise with the original traceback
+            if chunks is not None:
+                self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+            for bitmap in bitmap_cleanup:
+                self._mtmd_cpp.mtmd_bitmap_free(bitmap)
+            for video_ctx in video_cleanup:
+                self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
+            raise
+
+    def __call__(
+        self,
+        *,
+        llama: llama_core.Llama,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+        temperature: float = 0.2,
+        top_p: float = 0.95,
+        top_k: int = 40,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        stream: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        seed: Optional[int] = None,
+        response_format: Optional[
+            llama_types.ChatCompletionRequestResponseFormat
+        ] = None,
+        max_tokens: Optional[int] = None,
+        present_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        repeat_penalty: float = 1.1,
+        top_n_sigma: float = -1.00,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        xtc_threshold: float = 0.1,
+        xtc_probability: float = 0.0,
+        dry_multiplier: float = 0.0,
+        dry_base: float = 1.75,
+        dry_allowed_length: int = 2,
+        dry_penalty_last_n:int = 0,
+        dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"],
+        adaptive_target : float = -1.0,
+        adaptive_decay : float = 0.9,
+        use_infill: bool = False,
+        model: Optional[str] = None,
+        logits_processor: Optional[llama_core.LogitsProcessorList] = None,
+        grammar: Optional[llama_grammar.LlamaGrammar] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        add_generation_prompt: bool = True,
+        reasoning_budget: int = -1,
+        reasoning_start: str = "<think>",
+        reasoning_end: str = "</think>",
+        reasoning_budget_message: Optional[str] = None,
+        reasoning_start_in_prompt: bool = False,
+        reasoning_start_max_tokens: Optional[int] = 32,
+        **kwargs,  # type: ignore
+    ) -> Union[
+        llama_types.CreateChatCompletionResponse,
+        Iterator[llama_types.CreateChatCompletionStreamResponse],
+    ]:
+        # 1. Initialize mtmd context
+        self._init_mtmd_context(llama)
+        assert self.mtmd_ctx is not None
+
+        # 2. Concurrent Preprocessing & Ledger Construction
+        full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt(
+            llama=llama,
+            messages=messages,
+            functions=functions,
+            function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
+            add_generation_prompt=add_generation_prompt,
+        )
+
+        if self.verbose:
+            print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr)
+
+        try:
+            # 3. KV Cache Synchronization & State Rollback
+            # Compares the virtual ledger with physical history to prevent Cache Poisoning.
+            current_history = llama.input_ids[:llama.n_tokens].tolist()
+            longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose)
+
+            if longest_prefix < llama.n_tokens:
+                if llama.is_hybrid and llama._hybrid_cache_mgr is not None:
+                    if llama._hybrid_cache_mgr.max_checkpoints > 0:
+                        if self.verbose:
+                            print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). "
+                                f"Searching for nearest checkpoint...", file=sys.stderr)
+
+                        best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0)
+                        if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0):
+                            llama.n_tokens = best_ckpt.pos
+                            if self.verbose:
+                                print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr)
+                        else:
+                            if self.verbose:
+                                print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr)
+                            llama._hybrid_cache_mgr.clear()
+                            llama._ctx.memory_clear(True)
+                            llama.n_tokens = 0
+                    else:
+                        if self.verbose:
+                            print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr)
+                        llama._hybrid_cache_mgr.clear()
+                        llama._ctx.memory_clear(True)
+                        llama.n_tokens = 0
+                else:
+                    if self.verbose:
+                        print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr)
+                    llama._ctx.memory_seq_rm(0, longest_prefix, -1)
+                    llama.n_tokens = longest_prefix
+
+            n_past = llama.n_tokens
+
+            for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans:
+                # Skip previously matched chunks
+                if end_idx <= n_past:
+                    continue
+
+                if self._is_text_chunk(chunk_type):
+                    unprocessed_start = max(start_idx, n_past) - start_idx
+                    n_tokens_out = ctypes.c_size_t()
+                    tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out))
+
+                    if tokens_ptr and n_tokens_out.value > 0:
+                        all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
+                        tokens_to_eval = all_tokens[unprocessed_start:]
+
+                        if tokens_to_eval:
+                            if self.verbose:
+                                print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
+                            # Text evaluation delegates shift and chunking to native llama.eval
+                            llama.eval(tokens_to_eval)
+                            n_past = llama.n_tokens
+
+                elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
+                    chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr)
+
+                    if self.verbose:
+                        media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO"
+                        print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr)
+
+                    # Stage 5: Multimodal Physical OOM Defense
+                    if n_past + chunk_n_tokens > llama.n_ctx():
+                        if not llama._ctx.memory_can_shift():
+                            raise RuntimeError(
+                                f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend "
+                                f"(n_pos_per_embd > 1 or incompatible M-RoPE). "
+                                f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), "
+                                f"You MUST increase n_ctx to fit the dialogue."
+                            )
+                        else:
+                            # Safely discard oldest tokens while preserving system prompts
+                            n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch
+                            n_keep = min(llama.n_keep, n_past)
+                            n_discard = min(n_discard, n_past - n_keep)
+
+                            if n_discard <= 0:
+                                raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.")
+
+                            if self.verbose:
+                                print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr)
+
+                            # Execute physical memory shift
+                            llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard)
+                            llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard)
+
+                            # Shift python virtual array to match
+                            remaining_len = n_past - (n_keep + n_discard)
+                            if remaining_len > 0:
+                                llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past]
+
+                            n_past -= n_discard
+                            llama.n_tokens = n_past
+
+                    # Execute C++ Multimodal Black-box Extraction
+                    new_n_past = llama_cpp_lib.llama_pos(0)
+                    result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
+                        self.mtmd_ctx,
+                        llama._ctx.ctx,
+                        chunk_ptr,
+                        llama_cpp_lib.llama_pos(n_past),
+                        llama_cpp_lib.llama_seq_id(0),
+                        llama.n_batch,
+                        True, # logits_last = True, drastically saves computational overhead
+                        ctypes.byref(new_n_past)
+                    )
+
+                    if result != 0:
+                        raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.")
+
+                    # Update Ledger with "Negative Reverse Vocabulary" IDs
+                    llama.input_ids[n_past : new_n_past.value] = media_id
+                    n_past = new_n_past.value
+                    llama.n_tokens = n_past
+
+            # Extract the final, perfectly synchronized prompt sequence
+            prompt = llama.input_ids[: llama.n_tokens].tolist()
+
+            # End-of-Turn Checkpoint
+            # Anchors the state ONLY after the entire multi-modal turn is processed
+            if (
+                llama.is_hybrid
+                and llama._hybrid_cache_mgr is not None
+                and llama._hybrid_cache_mgr.max_checkpoints > 0
+            ):
+                if self.verbose:
+                    print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr)
+
+                llama._hybrid_cache_mgr.save_checkpoint(
+                    current_pos=llama.n_tokens,
+                    tokens=prompt,
+                    seq_id=0
+                )
+        finally:
+            # Cleanup chunks
+            if chunks is not None:
+                self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+                chunks = None
+            # Cleanup bitmaps
+            if bitmap_cleanup:
+                for bitmap in bitmap_cleanup:
+                    self._mtmd_cpp.mtmd_bitmap_free(bitmap)
+                bitmap_cleanup.clear()
+            bitmap_array = None
+
+        # Handle response format and tools (same as before)
+        if response_format is not None and response_format["type"] == "json_object":
+            grammar = _grammar_for_response_format(response_format)
+
+        # Convert legacy functions to tools
+        if functions is not None:
+            tools = [
+                {
+                    "type": "function",
+                    "function": function,
+                }
+                for function in functions
+            ]
+
+        # Convert legacy function_call to tool_choice
+        if function_call is not None:
+            if isinstance(function_call, str) and (
+                function_call == "none" or function_call == "auto"
+            ):
+                tool_choice = function_call
+            if isinstance(function_call, dict) and "name" in function_call:
+                tool_choice = {
+                    "type": "function",
+                    "function": {
+                        "name": function_call["name"],
+                    },
+                }
+
+        tool = None
+        if (
+            tool_choice is not None
+            and isinstance(tool_choice, dict)
+            and tools is not None
+        ):
+            name = tool_choice["function"]["name"]
+            tool = next((t for t in tools if t["function"]["name"] == name), None)
+            if tool is None:
+                raise ValueError(f"Tool choice '{name}' not found in tools.")
+            schema = tool["function"]["parameters"]
+            try:
+                # create grammar from json schema
+                grammar = llama_grammar.LlamaGrammar.from_json_schema(
+                    json.dumps(schema), verbose=llama.verbose
+                )
+            except Exception as e:
+                if llama.verbose:
+                    print(str(e), file=sys.stderr)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
+                )
+
+        completion_or_chunks = llama.create_completion(
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            typical_p=typical_p,
+            logprobs=top_logprobs if logprobs else None,
+            stream=stream,
+            stop=stop,
+            seed=seed,
+            max_tokens=max_tokens,
+            present_penalty=present_penalty,
+            frequency_penalty=frequency_penalty,
+            repeat_penalty=repeat_penalty,
+            top_n_sigma=top_n_sigma,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            xtc_threshold=xtc_threshold,
+            xtc_probability=xtc_probability,
+            dry_multiplier=dry_multiplier,
+            dry_base=dry_base,
+            dry_allowed_length=dry_allowed_length,
+            dry_penalty_last_n=dry_penalty_last_n,
+            dry_seq_breakers=dry_seq_breakers,
+            adaptive_target=adaptive_target,
+            adaptive_decay=adaptive_decay,
+            use_infill=use_infill,
+            model=model,
+            logits_processor=logits_processor,
+            grammar=grammar,
+            logit_bias=logit_bias,
+            reasoning_budget=reasoning_budget,
+            reasoning_start=reasoning_start,
+            reasoning_end=reasoning_end,
+            reasoning_budget_message=reasoning_budget_message,
+            reasoning_start_in_prompt=reasoning_start_in_prompt,
+            reasoning_start_max_tokens=reasoning_start_max_tokens,
+        )
+
+        if tool is not None:
+            tool_name = tool["function"]["name"]
+            return _convert_completion_to_chat_function(
+                tool_name, completion_or_chunks, stream
+            )
+        return _convert_completion_to_chat(completion_or_chunks, stream=stream)
+
+    def load_media(self, media_url: str, media_type: str) -> bytes:
+        """
+        Fetch the raw payload of one media item, dispatching on ``media_type``.
+
+        - ``"image"``: delegated to ``_load_image``.
+        - ``"audio"``: loaded with a 15 second timeout, then checked with
+          ``detect_audio_format``; the bytes are returned unchanged.
+        - ``"video"``: loaded with a 30 second timeout, no format check.
+
+        Raises:
+            ValueError: for an unknown ``media_type`` or an audio payload that
+                fails the magic-byte check (message prefixed with the handler's
+                log prefix).
+        """
+        if media_type == "video":
+            return self._load_bytes(media_url, timeout=30, kind="video")
+
+        if media_type == "image":
+            return self._load_image(media_url)
+
+        if media_type != "audio":
+            raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'")
+
+        # Audio: validate the container before handing the bytes on.
+        payload = self._load_bytes(media_url, timeout=15, kind="audio")
+        try:
+            self.detect_audio_format(payload)
+        except ValueError as err:
+            raise ValueError(f"{self.log_prefix}(load_media): {err}")
+        return payload
+
+    @staticmethod
+    def detect_audio_format(audio_bytes: bytes) -> str:
+        """
+        Identify an audio container from its leading magic bytes.
+
+        Returns ``"wav"``, ``"mp3"`` or ``"flac"``. The checks follow
+        llama.cpp's ``is_audio_file``: a RIFF file only counts as WAV when its
+        form type at bytes 8..11 is ``WAVE``, so other RIFF containers such as
+        AVI are rejected.
+
+        Raises:
+            ValueError: if fewer than 12 bytes are supplied or no supported
+                signature matches.
+        """
+        if len(audio_bytes) < 12:
+            raise ValueError("Audio data is corrupted or too small (less than 12 bytes).")
+
+        # WAV: "RIFF" chunk id plus "WAVE" form type.
+        if audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE":
+            return "wav"
+
+        # MP3: an ID3 tag, or an MPEG frame sync (0xFF followed by three set bits).
+        if audio_bytes.startswith(b"ID3"):
+            return "mp3"
+        if audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0:
+            return "mp3"
+
+        # FLAC stream marker.
+        if audio_bytes.startswith(b"fLaC"):
+            return "flac"
+
+        raise ValueError(
+            "Unsupported audio format detected via magic bytes. "
+            "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
+        )
+
+    # HTTP request headers that `_load_bytes` sends when downloading remote media.
+    # The User-Agent imitates a desktop Chrome browser.
+    # NOTE(review): presumably because some hosts reject urllib's default agent - confirm.
+    DEFAULT_HTTP_HEADERS = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/148.0.0.0 Safari/537.36"
+        ),
+    }
+
+    @staticmethod
+    def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes:
+        """
+        Load raw bytes from a data URI, local file path, or remote URL.
+
+        Sources are tried in this order:
+        1. ``data:`` URI - everything after the first comma is base64-decoded.
+        2. Existing local path - the file is read in binary mode.
+        3. Anything else - fetched with ``urllib`` using ``DEFAULT_HTTP_HEADERS``.
+
+        Args:
+            media_url: Data URI, filesystem path, or URL.
+            timeout: Timeout in seconds for the remote request.
+            kind: Human-readable media kind, used only in error messages.
+
+        Returns:
+            The non-empty payload.
+
+        Raises:
+            ValueError: the data URI has no comma separator, or the payload is empty.
+            ConnectionError: the download failed. This covers URL/HTTP errors as
+                well as socket timeouts and resets that occur while the body is
+                being read.
+        """
+        # NOTE(security): `media_url` can originate from request content; local
+        # paths and every URL scheme urllib supports are accepted as-is. Restrict
+        # them at the API boundary if this handler is exposed to untrusted callers.
+        media_bytes = b""
+
+        # 1. Handle data URI
+        if media_url.strip().startswith("data:"):
+            _, comma, base64_data = media_url.partition(",")
+            if not comma:
+                raise ValueError("Invalid data URI: missing comma separator")
+
+            media_bytes = base64.b64decode(base64_data)
+
+        # 2. Handle local file path
+        elif os.path.exists(media_url):
+            with open(media_url, "rb") as f:
+                media_bytes = f.read()
+
+        # 3. Handle remote URL via HTTP/HTTPS
+        else:
+            req = urllib.request.Request(
+                media_url,
+                headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS,
+            )
+            try:
+                with urllib.request.urlopen(req, timeout=timeout) as f:
+                    media_bytes = f.read()
+            except OSError as e:
+                # URLError/HTTPError are OSError subclasses. Catching OSError also
+                # wraps read-phase socket timeouts and connection resets, which
+                # urllib does not convert to URLError.
+                raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") from e
+
+        if not media_bytes:
+            raise ValueError(f"Empty {kind} data received")
+
+        return media_bytes
+
+    @staticmethod
+    def _load_image(image_url: str) -> bytes:
+        """
+        Load an image and re-encode it as an RGB JPEG.
+
+        The source may be anything ``_load_bytes`` accepts (data URI, local
+        path, remote URL) in any format Pillow can open, see
+        https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
+
+        Images carrying transparency are flattened onto a solid background:
+        black when the visible pixels are bright on average (mean luminance
+        above 127), white otherwise. Other non-RGB modes are converted to RGB.
+
+        Returns:
+            JPEG bytes (quality=95, optimized, progressive).
+
+        Raises:
+            ValueError: if no image data was received.
+            ImportError: if Pillow is not installed.
+        """
+        raw = MTMDChatHandler._load_bytes(image_url, timeout=15, kind="image")
+        if not raw:
+            raise ValueError("Empty image data received")
+
+        # Pillow is imported here, at call time, rather than at module import.
+        try:
+            from PIL import Image, ImageStat
+        except ImportError:
+            raise ImportError("Pillow is required for image processing. Install with: pip install pillow")
+
+        import io
+        img = Image.open(io.BytesIO(raw))
+
+        has_alpha = img.mode in ("RGBA", "LA", "PA") or (
+            img.mode == "P" and "transparency" in img.info
+        )
+
+        if has_alpha:
+            # Palette transparency has no alpha band until expanded to RGBA.
+            if img.mode == "P":
+                img = img.convert("RGBA")
+
+            alpha_band = img.split()[-1]
+            # Luminance statistics restricted to non-transparent pixels.
+            visible = ImageStat.Stat(img.convert("L"), mask=alpha_band)
+            is_bright = visible.count[0] > 0 and visible.mean[0] > 127
+
+            # Contrast the content: black behind bright pixels, white otherwise.
+            canvas = Image.new("RGB", img.size, (0, 0, 0) if is_bright else (255, 255, 255))
+            canvas.paste(img, mask=alpha_band)
+            img = canvas
+        elif img.mode != "RGB":
+            # CMYK, grayscale, opaque palette, etc.
+            img = img.convert("RGB")
+
+        encoded = io.BytesIO()
+        img.save(encoded, format="JPEG", quality=95, optimize=True, progressive=True)
+        return encoded.getvalue()
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id: str,
+        filename: Optional[str],
+        local_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+        cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        **kwargs: Any,
+    ) -> "MTMDChatHandler":
+        """
+        Build a handler from an mmproj file hosted on the Hugging Face Hub.
+
+        Args:
+            repo_id: Hub repository id.
+            filename: Exact file name or ``fnmatch`` pattern; it must match
+                exactly one file in the repository listing.
+            local_dir: Optional download directory, passed to ``hf_hub_download``.
+            local_dir_use_symlinks: Passed through to ``hf_hub_download``.
+            cache_dir: Optional cache directory, passed to ``hf_hub_download``.
+            **kwargs: Forwarded to the handler constructor.
+
+        Returns:
+            An instance of ``cls`` constructed with ``mmproj_path`` set to the
+            downloaded file.
+
+        Raises:
+            ValueError: ``filename`` is None, or it matches zero or several files.
+            ImportError: ``huggingface_hub`` is not installed.
+        """
+        import fnmatch
+        from pathlib import Path
+
+        # Fail early and clearly: fnmatch would otherwise raise an opaque
+        # TypeError on None, and only after the repository has been listed.
+        if filename is None:
+            raise ValueError(
+                f"{cls.__name__}.from_pretrained requires a filename "
+                "(exact name or glob pattern)."
+            )
+
+        try:
+            from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
+            from huggingface_hub.utils import validate_repo_id  # type: ignore
+        except ImportError as e:
+            raise ImportError(
+                f"{cls.__name__}.from_pretrained requires the huggingface_hub package. "
+                "You can install it with `pip install --upgrade huggingface_hub`."
+            ) from e
+
+        validate_repo_id(repo_id)
+
+        hffs = HfFileSystem()
+
+        files = [
+            file["name"] if isinstance(file, dict) else file
+            for file in hffs.ls(repo_id)  # type: ignore
+        ]
+
+        # Strip the repo prefix so patterns are matched against repo-relative paths
+        file_list: List[str] = [str(Path(file).relative_to(repo_id)) for file in files]
+
+        matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore
+
+        if len(matching_files) == 0:
+            raise ValueError(
+                f"No file found in {repo_id} that match {filename}\n\n"
+                f"Available Files:\n{json.dumps(file_list)}"
+            )
+
+        if len(matching_files) > 1:
+            # Report the same repo-relative listing as the "no file" error above.
+            raise ValueError(
+                f"Multiple files found in {repo_id} matching {filename}\n\n"
+                f"Available Files:\n{json.dumps(file_list)}"
+            )
+
+        (matching_file,) = matching_files
+
+        subfolder = str(Path(matching_file).parent)
+        filename = Path(matching_file).name
+
+        # download the file
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            subfolder=subfolder,
+            local_dir=cast(Union[str, Path, None], local_dir),
+            local_dir_use_symlinks=local_dir_use_symlinks,
+            cache_dir=cast(Union[str, Path, None], cache_dir),
+        )
+
+        if local_dir is None:
+            # Resolve the cached path without touching the network again.
+            model_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                subfolder=subfolder,
+                local_dir=local_dir,
+                local_dir_use_symlinks=local_dir_use_symlinks,
+                cache_dir=cast(Union[str, Path, None], cache_dir),
+                local_files_only=True,
+            )
+        else:
+            model_path = os.path.join(local_dir, filename)
+
+        return cls(
+            mmproj_path=model_path,
+            **kwargs,
+        )
+
+# Experimental handler: not recommended for general use at this time.
+class GenericMTMDChatHandler(MTMDChatHandler):
+    """
+    Multimodal chat handler driven by a caller-supplied Jinja chat template
+    instead of a class-level ``CHAT_FORMAT``.
+
+    Before every call, the template is scanned for the placeholders listed in
+    ``KNOWN_MEDIA_TAGS`` and the ones present are stored in
+    ``_chat_format_parser_tags``.
+    """
+
+    # Media placeholder tokens looked up in the chat template.
+    KNOWN_MEDIA_TAGS = [
+        "<|image_pad|>",
+        "<|audio_pad|>",
+        "<|video_pad|>",
+        "<|image|>",
+        "<|audio|>",
+        "<|video|>",
+        "[IMG]"
+    ]
+
+    def __init__(
+        self,
+        chat_format: str,
+        mmproj_path: str,
+        verbose: bool = True,
+        **kwargs
+    ) -> None:
+        """
+        Args:
+            chat_format: Jinja chat template text; must not be None.
+            mmproj_path: Path forwarded to the base handler.
+            verbose: When true, the template is printed on construction.
+            **kwargs: Forwarded to the base handler.
+
+        Raises:
+            ValueError: if ``chat_format`` is None.
+        """
+        # Assigned before the base initializer runs so the template is already
+        # in place when the base class sets itself up.
+        self.chat_format = chat_format
+        if chat_format is None:
+            raise ValueError("Failed to get model chat template automatically.")
+
+        self.verbose = verbose
+        if verbose:
+            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
+
+        super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs)
+
+    def __call__(self, **kwargs):
+        """Record which known media tags the template contains, then defer to the base handler."""
+        template = self.chat_format
+        present_tags = []
+        for tag in self.KNOWN_MEDIA_TAGS:
+            if tag in template:
+                present_tags.append(tag)
+        self._chat_format_parser_tags = present_tags
+
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
+
+class Llava15ChatHandler(MTMDChatHandler):
+    """
+    Chat handler that renders a plain-text USER / ASSISTANT transcript.
+
+    Template behavior:
+    - System message content is emitted as-is, without a role prefix.
+    - Each user turn starts with a newline and "USER: ". For list content, all
+      image URLs are emitted first, followed by all text parts.
+    - Assistant turns with non-null content start with a newline and "ASSISTANT: ".
+    - With ``add_generation_prompt``, a trailing "ASSISTANT: " line is appended.
+    """
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+            "{% if message.role == 'system' %}"
+                "{{ message.content }}"
+            "{% endif %}"
+
+            "{% if message.role == 'user' %}"
+                "{% if message.content is string %}"
+                    "\nUSER: {{ message.content }}"
+                "{% elif message.content is iterable %}"
+                    "\nUSER: "
+                    "{% for content in message.content %}"
+                        "{% if content.type == 'image_url' %}"
+                            "{{ content.image_url if content.image_url is string else content.image_url.url }}"
+                        "{% endif %}"
+                    "{% endfor %}"
+                    "{% for content in message.content %}"
+                        "{% if content.type == 'text' %}"
+                            "{{ content.text }}"
+                        "{% endif %}"
+                    "{% endfor %}"
+                "{% endif %}"
+            "{% endif %}"
+
+            "{% if message.role == 'assistant' and message.content is not none %}"
+                "\nASSISTANT: {{ message.content }}"
+            "{% endif %}"
+        "{% endfor %}"
+
+        "{% if add_generation_prompt %}"
+            "\nASSISTANT: "
+        "{% endif %}"
+    )
+
+
+class ObsidianChatHandler(MTMDChatHandler):
+    """
+    Chat handler using a ChatML-like layout in which every turn is terminated
+    by a "###" line instead of an end-of-turn token.
+    """
+
+    # Prompt Format
+    # The model follows the ChatML format, but with ### as the separator:
+
+    # <|im_start|>user
+    # What is this sign about?\n<image>
+    # ###
+    # <|im_start|>assistant
+    # The sign is about bullying, and it is placed on a black background with a red background.
+    # ###
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+        # System message
+        "{% if message.role == 'system' %}"
+        "<|im_start|>system\n"
+        "{{ message.content }}\n"
+        "###\n"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "<|im_start|>user\n"
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        # List content: image URLs first, then text parts.
+        # Plain strings also pass the `iterable` test, but their characters have
+        # no `type`, so the loops below emit nothing for them.
+        "{% if message.content is iterable %}"
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' and content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.type == 'image_url' and content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "###\n"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        "<|im_start|>assistant\n"
+        "{{ message.content }}"
+        "###\n"
+        "{% endif %}"
+        "{% endfor %}"
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|im_start|>assistant\n"
+        "{% endif %}"
+    )
+
+
+class MoondreamChatHandler(MTMDChatHandler):
+    """
+    Chat handler that renders a "Question: ... / Answer: ..." transcript.
+
+    System messages are not rendered by this template; only user and assistant
+    roles produce output.
+    """
+
+    # Chat Format:
+    # f"<image>\n\n{chat_history}Question: {question}\n\nAnswer:"
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+        "{% if message.role == 'user' %}"
+        "{% if message.content is iterable %}"
+        # <image>: every image URL, each followed by a blank line
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}\n\n"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}\n\n"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        # Question: one line per text part of list content
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "Question: {{ content.text }}\n\n"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        # Question: plain string content
+        "{% if message.content is string %}"
+        "Question: {{ message.content }}\n\n"
+        "{% endif %}"
+        "{% endif %}"
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "Answer:{{ message.content }}\n\n"
+        "{% endif %}"
+        "{% endfor %}"
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "Answer:"
+        "{% endif %}"
+    )
+
+
+class Llava16ChatHandler(MTMDChatHandler):
+    """
+    Chat handler whose template emits message contents without role prefixes:
+    system content, then for user turns the image URLs (each followed by a
+    newline) and the text, then assistant content.
+    """
+
+    # Example prompt
+    # "DEFAULT_SYSTEM_MESSAGE + USER: <image>\nWhat is shown in this image? ASSISTANT:"
+    # NOTE(review): the template below emits neither "USER:" nor "ASSISTANT:" and its
+    # generation prompt is "Answer:", which differs from the example above - confirm
+    # which form the model expects.
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+        "{% if message.role == 'system' %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        "{% if message.role == 'user' %}"
+        "{% if message.content is iterable %}"
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}\n"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}\n"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        # Question: text parts of list content
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        # Question: plain string content
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        "{% endif %}"
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        "{% endfor %}"
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "Answer:"
+        "{% endif %}"
+    )
+
+
+class NanoLlavaChatHandler(MTMDChatHandler):
+    """
+    Chat handler using ChatML markers with no newline after ``<|im_end|>``,
+    so consecutive turns are directly adjacent in the rendered prompt.
+    """
+
+    # Prompt Format
+    # The model follows the ChatML standard, but without \n at the end of <|im_end|>:
+
+    # <|im_start|>system
+    # Answer the question<|im_end|><|im_start|>user
+    # <image>
+    # What is the picture about?<|im_end|><|im_start|>assistant
+    # NOTE(review): not referenced in this class body; presumably consumed by the
+    # base handler when no system message is given - confirm.
+    DEFAULT_SYSTEM_MESSAGE = "Answer the question"
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+        # System message
+        "{% if message.role == 'system' %}"
+        "<|im_start|>system\n"
+        "{{ message.content }}"
+        "<|im_end|>"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "<|im_start|>user\n"
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        # List content: image URLs first, then text parts
+        "{% if message.content is iterable %}"
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' and content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.type == 'image_url' and content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "<|im_end|>"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        "<|im_start|>assistant\n"
+        "{{ message.content }}"
+        "<|im_end|>"
+        "{% endif %}"
+        "{% endfor %}"
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|im_start|>assistant\n"
+        "{% endif %}"
+    )
+
+
+class Llama3VisionAlphaChatHandler(MTMDChatHandler):
+    """
+    Chat handler using Llama-3 header tokens
+    (``<|start_header_id|>``, ``<|end_header_id|>``, ``<|eot_id|>``).
+
+    Only user and assistant roles render a header name and content. Every
+    message, whatever its role, still emits the opening ``<|start_header_id|>``
+    and the closing ``<|eot_id|>``.
+    """
+
+    # question = "<image>" + q
+
+    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+        "<|start_header_id|>"
+        "{% if message.role == 'user' %}"
+        "user<|end_header_id|>\n\n"
+        "{% if message.content is iterable %}"
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        # Question: text parts of list content
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        # Question: plain string content
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+        "{% endif %}"
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "assistant<|end_header_id|>\n\n"
+        "{{ message.content }}"
+        "{% endif %}"
+        "<|eot_id|>"
+        "{% endfor %}"
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "{% endif %}"
+    )
+
+
+# Shorter alias: `Llama3VisionAlpha` is the same class object as `Llama3VisionAlphaChatHandler`.
+Llama3VisionAlpha = Llama3VisionAlphaChatHandler
+
+
+class MiniCPMv26ChatHandler(MTMDChatHandler):
+    """
+    Chat handler using ChatML turns with numbered image placeholders.
+
+    Template behavior:
+    - If the first message is not a system message, a default
+      "You are a helpful assistant." system turn is emitted first.
+    - Every message is wrapped in ``<|im_start|>{role}`` ... ``<|im_end|>``.
+    - Images are rendered as ``<image_id>N</image_id>: <image>URL</image>``,
+      where N is a counter shared across the whole conversation; within one
+      message all images come before the text parts.
+    """
+
+    # Jinja template; adjacent string literals are concatenated into one template.
+    CHAT_FORMAT = (
+        # A namespace is used so the counter survives across loop iterations
+        "{% set image_count = namespace(value=0) %}"
+        "{% for message in messages %}"
+        "{% if loop.first and messages[0]['role'] != 'system' %}"
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        "{% endif %}"
+        "<|im_start|>{{ message['role'] }}\n"
+        "{% if message['content'] is iterable %}"
+        "{% for content in message['content'] %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{% set image_count.value = image_count.value + 1 %}"
+        "<image_id>{{ image_count.value }}</image_id>: <image>{{ content.image_url }}</image>"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{% set image_count.value = image_count.value + 1 %}"
+        "<image_id>{{ image_count.value }}</image_id>: <image>{{ content.image_url.url }}</image>"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        "{% for content in message['content'] %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "{% if message['content'] is string %}"
+        "{{ message['content'] }}"
+        "{% endif %}"
+        "<|im_end|>\n"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}"
+        "<|im_start|>assistant\n"
+        "{% endif %}"
+    )
+
+
+class MiniCPMv45ChatHandler(MTMDChatHandler):
+    """
+    Handler for MiniCPM-V 4.5 models.
+
+    Supports:
+    - Multi-step tool calls with <tool_call> and <tool_response> XML tags.
+    - Integrated reasoning (thinking) process with <think> tags.
+    - Specialized system prompt handling with tool definitions.
+    - Global image numbering for multi-image processing.
+    """
+
+    # Model specific control tokens
+    MINICPMV_BOS_TOKEN = "<|im_start|>"
+    MINICPMV_EOS_TOKEN = "<|im_end|>"
+    MINICPMV_PAD_TOKEN = "<|endoftext|>"
+
+    # Image placeholder tags
+    MINICPMV_IMAGE_START_TOKEN = "<image>"
+    MINICPMV_IMAGE_END_TOKEN = "</image>"
+    MINICPMV_IMAGE_ID_START_TOKEN = "<image_id>"
+    MINICPMV_IMAGE_ID_END_TOKEN = "</image_id>"
+
+    CHAT_FORMAT = (
+        # --- 1. First System Message & Tools Definitions ---
+        "{%- if tools %}"
+            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}"
+            "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}"
+            "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}"
+            "{{- 'You are provided with function signatures within <tools></tools> XML tags:\\n<tools>' }}"
+            "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}"
+            "{{- '\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>" + MINICPMV_EOS_TOKEN + "\\n' }}"
+        "{%- elif messages[0].role == 'system' %}"
+            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}"
+        "{%- endif %}"
+
+        # --- 2. Message Stream Processing ---
+        "{% set image_count = namespace(value=0) %}"
+        "{%- for message in messages %}"
+            # --- Unified Role Handling (User, Assistant, and subsequent Systems) ---
+            "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}"
+                "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}"
+
+                "{%- set content = message.content %}"
+                "{%- if content is not string %}"
+                    "{%- set ns = namespace(content_str='') %}"
+                    "{%- for item in content %}"
+                        # --- Explicit image_url type and value checking ---
+                        "{%- if item.type == 'image_url' %}"
+                            "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}"
+                            "{%- set image_count.value = image_count.value + 1 %}"
+                            # Format: <image_id>N</image_id>: <image>IMAGE_URL</image>
+                            "{%- set ns.content_str = ns.content_str + '<image_id>' + (image_count.value | string) + '</image_id>: <image>' + image_url + '</image>' %}"
+                        "{%- elif item.type == 'text' %}"
+                            "{%- set ns.content_str = ns.content_str + item.text %}"
+                        "{%- endif %}"
+                    "{%- endfor %}"
+                    "{%- set content = ns.content_str %}"
+                "{%- endif %}"
+
+                "{{- content -}}"
+
+                # Append tool_calls to assistant messages if they exist
+                "{%- if message.role == 'assistant' and message.tool_calls %}"
+                    "{%- for tool_call in message.tool_calls %}"
+                        "{%- set tc = tool_call.function if tool_call.function else tool_call %}"
+                        "{{- '\\n<tool_call>\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}"
+                        "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}"
+                        "{{- '}\\n</tool_call>' }}"
+                    "{%- endfor %}"
+                "{%- endif %}"
+                "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"
+
+            # --- Specialized Tool Response Handling ---
+            # Group consecutive tool responses under a single user-like block
+            "{%- elif message.role == 'tool' %}"
+                "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}"
+                    "{{- '" + MINICPMV_BOS_TOKEN + "user' }}"
+                "{%- endif %}"
+                "{{- '\\n<tool_response>\\n' + message.content + '\\n</tool_response>' }}"
+                "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}"
+                    "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"
+                "{%- endif %}"
+            "{%- endif %}"
+        "{%- endfor %}"
+
+        # --- 3. Generation Prompt ---
+        "{%- if add_generation_prompt %}"
+            "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}"
+            # Handle thinking/reasoning block visibility based on configuration
+            "{%- if enable_thinking is defined and enable_thinking is false %}"
+                "{{- '<think>\\n\\n</think>\\n\\n' }}"
+            "{%- elif enable_thinking is defined and enable_thinking is true %}"
+                "{{- '<think>\\n' }}"
+            "{%- endif %}"
+        "{%- endif %}"
+    )
+
+    def __init__(self, enable_thinking: bool = True, **kwargs):
+        """
+        Initializes the MiniCPM-V 4.5 Handler.
+
+        Args:
+            enable_thinking (bool): If True, model generates reasoning before the final answer.
+            **kwargs: Additional arguments for the base MTMDChatHandler.
+        """
+        self.enable_thinking = enable_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """
+        Render the prompt and run generation through the base handler.
+
+        The ``enable_thinking`` flag is passed to the template, and the model's
+        end-of-turn and padding tokens are always used as stop sequences. Any
+        stop sequences supplied by the caller (a string or a list of strings)
+        are kept and appended after the mandatory ones.
+
+        Args:
+            **kwargs: Keyword arguments for the base handler; must include ``llama``.
+        """
+        # Inject thinking control flag into the template
+        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
+
+        # Merge the mandatory stop tokens with caller-supplied ones instead of
+        # overwriting them, so a user-provided `stop` is no longer discarded.
+        required_stops = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN]
+        user_stops = kwargs.get('stop')
+        if user_stops is None:
+            user_stops = []
+        elif isinstance(user_stops, str):
+            user_stops = [user_stops]
+        kwargs['stop'] = required_stops + [s for s in user_stops if s not in required_stops]
+
+        llama = kwargs['llama']
+
+        # Zero the token buffer before delegating.
+        # NOTE(review): presumably to keep stale tokens from a previous turn out of
+        # the base handler's processing - confirm the intent.
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
+        return super().__call__(**kwargs)
+
+
+class MiniCPMV46ChatHandler(MTMDChatHandler):
+    """
+    Handler for MiniCPM-V-4.6 models.
+
+    Features:
+    - Aligned with official tokenizer_config.json special tokens.
+    - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens (the video branch of the template is currently commented out).
+    - Integrated MTMD-style URL and Base64 injection for visual content.
+    - Specialized `<tool_call>` and `<tool_response>` block generation.
+    - Autonomously folds previous reasoning paths using `last_query_index`.
+    - Toggles `<think>` block generation via `enable_thinking` (constructor default is True; the template's own fallback when the variable is undefined is false).
+    """
+
+    # Core tokens
+    MINICPM_BOS_TOKEN = "<|im_start|>"
+    MINICPM_EOS_TOKEN = "<|im_end|>"
+    MINICPM_PAD_TOKEN = "<|endoftext|>"
+
+    # Vision tokens
+    MINICPM_VISION_BOS_TOKEN = "<|vision_start|>"
+    MINICPM_VISION_EOS_TOKEN = "<|vision_end|>"
+    MINICPM_IMAGE_TOKEN = "<|image_pad|>"
+    MINICPM_VIDEO_TOKEN = "<|video_pad|>"
+
+    CHAT_FORMAT = (  # Jinja chat template: content macro, optional tools/system header, message loop, generation prompt
+        "{%- if enable_thinking is not defined -%}\n"
+        "    {%- set enable_thinking = false -%}\n"
+        "{%- endif -%}\n"
+        "{%- macro render_content(content, is_system_content=false) -%}\n"
+        "    {%- if content is string -%}\n"
+        "        {{- content -}}\n"
+        "    {%- elif content is iterable and content is not mapping -%}\n"
+        "        {%- set ns = namespace(parts=[]) -%}\n"
+        "        {%- for item in content -%}\n"
+        "            {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n"
+        "                {%- if is_system_content -%}\n"
+        "                    {{- raise_exception('System message cannot contain images.') -}}\n"
+        "                {%- endif -%}\n"
+        "                {%- set url_val = '' -%}\n"
+        "                {%- if item.type == 'image_url' -%}\n"
+        "                    {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n"
+        "                {%- endif -%}\n"
+        "                {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n"
+        # "            {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n"
+        # "                {%- if is_system_content -%}\n"
+        # "                    {{- raise_exception('System message cannot contain videos.') -}}\n"
+        # "                {%- endif -%}\n"
+        # "                {%- set url_val = '' -%}\n"
+        # "                {%- if item.type == 'video_url' -%}\n"
+        # "                    {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n"
+        # "                {%- endif -%}\n"
+        # "                {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n"
+        "            {%- elif 'text' in item -%}\n"
+        "                {%- set ns.parts = ns.parts + [item.text] -%}\n"
+        "            {%- else -%}\n"
+        "                {{- raise_exception('Unexpected item type in content.') -}}\n"
+        "            {%- endif -%}\n"
+        "        {%- endfor -%}\n"
+        "        {{- ns.parts | join('\\n') -}}\n"
+        "    {%- elif content is none or content is undefined -%}\n"
+        "        {{- '' -}}\n"
+        "    {%- else -%}\n"
+        "        {{- raise_exception('Unexpected content type.') -}}\n"
+        "    {%- endif -%}\n"
+        "{%- endmacro -%}\n"
+        "{%- if not messages %}\n"
+        "    {{- raise_exception('No messages provided.') }}\n"
+        "{%- endif %}\n"
+        "{%- if tools and tools is iterable and tools is not mapping %}\n"
+        "    {{- '<|im_start|>system\\n' }}\n"
+        "    {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>' }}\n"
+        "    {%- for tool in tools %}\n"
+        "        {{- '\\n' }}\n"
+        "        {{- tool | tojson }}\n"
+        "    {%- endfor %}\n"
+        "    {{- '\\n</tools>' }}\n"
+        "    {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n"
+        "    {%- if messages[0].role == 'system' %}\n"
+        "        {%- set content = render_content(messages[0].content, true)|trim %}\n"
+        "        {%- if content %}\n"
+        "            {{- '\\n\\n' + content }}\n"
+        "        {%- endif %}\n"
+        "    {%- endif %}\n"
+        "    {{- '<|im_end|>\\n' }}\n"
+        "{%- else %}\n"
+        "    {%- if messages[0].role == 'system' %}\n"
+        "        {%- set content = render_content(messages[0].content, true)|trim %}\n"
+        "        {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n"
+        "    {%- endif %}\n"
+        "{%- endif %}\n"
+        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n"
+        "{%- for message in messages[::-1] %}\n"
+        "    {%- set index = (messages|length - 1) - loop.index0 %}\n"
+        "    {%- if ns.multi_step_tool and message.role == 'user' %}\n"
+        "        {%- set content = render_content(message.content)|trim %}\n"
+        "        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n"
+        "            {%- set ns.multi_step_tool = false %}\n"
+        "            {%- set ns.last_query_index = index %}\n"
+        "        {%- endif %}\n"
+        "    {%- endif %}\n"
+        "{%- endfor %}\n"
+        "{%- if ns.multi_step_tool %}\n"
+        "    {{- raise_exception('No user query found in messages.') }}\n"
+        "{%- endif %}\n"
+        "{%- for message in messages %}\n"
+        "    {%- set content = render_content(message.content)|trim %}\n"
+        "    {%- if message.role == 'system' %}\n"
+        "        {%- if not loop.first %}\n"
+        "            {{- raise_exception('System message must be at the beginning.') }}\n"
+        "        {%- endif %}\n"
+        "    {%- elif message.role == 'user' %}\n"
+        "        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n"
+        "    {%- elif message.role == 'assistant' %}\n"
+        "        {%- set reasoning_content = '' %}\n"
+        "        {%- if message.reasoning_content is string %}\n"
+        "            {%- set reasoning_content = message.reasoning_content %}\n"
+        "        {%- else %}\n"
+        "            {%- if '</think>' in content %}\n"
+        "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n"
+        "                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n"
+        "            {%- endif %}\n"
+        "        {%- endif %}\n"
+        "        {%- set reasoning_content = reasoning_content|trim %}\n"
+        "        {%- if loop.index0 > ns.last_query_index %}\n"
+        "            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n"
+        "        {%- else %}\n"
+        "            {{- '<|im_start|>' + message.role + '\\n' + content }}\n"
+        "        {%- endif %}\n"
+        "        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n"
+        "            {%- for tool_call in message.tool_calls %}\n"
+        "                {%- if tool_call.function is defined %}\n"
+        "                    {%- set tool_call = tool_call.function %}\n"
+        "                {%- endif %}\n"
+        "                {%- if loop.first %}\n"
+        "                    {%- if content|trim %}\n"
+        "                        {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
+        "                    {%- else %}\n"
+        "                        {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
+        "                    {%- endif %}\n"
+        "                {%- else %}\n"
+        "                    {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n"
+        "                {%- endif %}\n"
+        "                {%- if tool_call.arguments is defined %}\n"
+        "                    {%- for args_name, args_value in tool_call.arguments|items %}\n"
+        "                        {{- '<parameter=' + args_name + '>\\n' }}\n"
+        "                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n"
+        "                        {{- args_value }}\n"
+        "                        {{- '\\n</parameter>\\n' }}\n"
+        "                    {%- endfor %}\n"
+        "                {%- endif %}\n"
+        "                {{- '</function>\\n</tool_call>' }}\n"
+        "            {%- endfor %}\n"
+        "        {%- endif %}\n"
+        "        {{- '<|im_end|>\\n' }}\n"
+        "    {%- elif message.role == 'tool' %}\n"
+        "        {%- if loop.previtem and loop.previtem.role != 'tool' %}\n"
+        "            {{- '<|im_start|>user' }}\n"
+        "        {%- endif %}\n"
+        "        {{- '\\n<tool_response>\\n' }}\n"
+        "        {{- content }}\n"
+        "        {{- '\\n</tool_response>' }}\n"
+        "        {%- if not loop.last and loop.nextitem.role != 'tool' %}\n"
+        "            {{- '<|im_end|>\\n' }}\n"
+        "        {%- elif loop.last %}\n"
+        "            {{- '<|im_end|>\\n' }}\n"
+        "        {%- endif %}\n"
+        "    {%- else %}\n"
+        "        {{- raise_exception('Unexpected message role.') }}\n"
+        "    {%- endif %}\n"
+        "{%- endfor %}\n"
+        "{%- if add_generation_prompt %}\n"
+        "    {{- '<|im_start|>assistant\\n' }}\n"
+        "    {%- if enable_thinking is defined and enable_thinking is false %}\n"
+        "        {{- '<think>\\n\\n</think>\\n\\n' }}\n"
+        "    {%- else %}\n"
+        "        {{- '<think>\\n' }}\n"
+        "    {%- endif %}\n"
+        "{%- endif %}\n"
+    )
+
+    def __init__(self, enable_thinking: bool = True, **kwargs):
+        """
+        Initializes the MiniCPM-V-4.6 Handler.
+
+        Args:
+            enable_thinking (bool): Defaults to True. When False the generation prompt gets an empty, already-closed `<think>` block; otherwise an open `<think>` block. The template's own `false` fallback applies only when the variable is undefined.
+            **kwargs: Forwarded unchanged to the base MTMDChatHandler initializer.
+        """
+        self.enable_thinking = enable_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Inject `enable_thinking` into the template arguments, force the MiniCPM stop tokens, then return the result of the base handler's `__call__`."""
+        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
+
+        # MiniCPM uses standard <|im_end|> ChatML stop formatting; note this replaces any `stop` value supplied by the caller
+        kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN]
+
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
+
+        return super().__call__(**kwargs)
+
+
+class Gemma3ChatHandler(MTMDChatHandler):
+    """Handler for Gemma 3 models: renders messages as `<start_of_turn>`/`<end_of_turn>` turns and wraps image URLs in `<start_of_image>`/`<end_of_image>`."""
+    GEMMA3_BOI_TOKEN  = "<start_of_image>"
+    GEMMA3_EOI_TOKEN = "<end_of_image>"
+    GEMMA3_BOS_TOKEN = "<bos>"
+    GEMMA3_EOS_TOKEN = "<eos>"
+
+    CHAT_FORMAT = (  # Jinja chat template; a leading system message is not a turn of its own but becomes a prefix of the first turn
+        "{% if messages[0]['role'] == 'system' %}"
+        "{% set loop_messages = messages[1:] %}"
+        "{% if messages[0]['content'] is string %}"
+        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
+        "{% else %}"
+        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"
+        "{% endif %}"
+        "{% else %}"
+        "{% set loop_messages = messages %}"
+        "{% set first_user_prefix = '' %}"
+        "{% endif %}"
+        # Remaining messages must alternate: 'user' at even positions, anything else at odd positions
+        "{% for message in loop_messages %}"
+        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
+        "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}"
+        "{% endif %}"
+        # The 'assistant' role is rendered under the name 'model'
+        "{% if message['role'] == 'assistant' %}"
+        "{% set role = 'model' %}"
+        "{% else %}"
+        "{% set role = message['role'] %}"
+        "{% endif %}"
+        # Open the turn; only the first rendered turn carries the system prefix
+        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else '') }}"
+        # Body: strings are trimmed; list content emits image_url and text items, any other item emits nothing
+        "{% if message['content'] is string %}"
+        "{{ message['content'] | trim }}"
+        "{% elif message['content'] is iterable %}"
+        "{% for item in message['content'] %}"
+        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"
+        "{{ '<start_of_image>' + item['image_url'] + '<end_of_image>' }}"
+        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"
+        "{{ '<start_of_image>' + item['image_url']['url'] + '<end_of_image>' }}"
+        "{% elif item['type'] == 'text' %}"
+        "{{ item['text'] | trim }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% else %}"
+        "{{ raise_exception('Invalid content type') }}"
+        "{% endif %}"
+        # Close the turn
+        "<end_of_turn>\n"
+        "{% endfor %}"
+        # When requested, open a 'model' turn for the reply to be generated
+        "{% if add_generation_prompt %}"
+        "<start_of_turn>model\n"
+        "{% endif %}"
+    )
+
+
+class Gemma4ChatHandler(MTMDChatHandler):
+    """
+    Handler for Gemma 4 models.
+
+    Note on `enable_thinking`:
+        The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models.
+        It is NOT supported by Gemma4 E2B and E4B models.
+
+    [Important Note for Audio Processing!]
+        It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models.
+        Other quantizations are known to have degraded performance;
+        ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463
+    """
+
+    # The special token in Gemma 4
+    GEMMA4_BOI_TOKEN  = "<|image>"
+    GEMMA4_EOI_TOKEN = "<image|>"
+    GEMMA4_BOA_TOKEN  = "<|audio>"
+    GEMMA4_EOA_TOKEN = "<audio|>"
+    GEMMA4_BOS_TOKEN = "<bos>"
+    GEMMA4_EOS_TOKEN = "<eos>"
+    GEMMA4_SOT_TOKEN = "<|turn>"
+    GEMMA4_EOT_TOKEN = "<turn|>"
+    GEMMA4_SOC_TOKEN = "<|channel>"
+    GEMMA4_EOC_TOKEN = "<channel|>"
+    GEMMA4_STC_TOKEN = "<|tool_call>"
+    GEMMA4_ETC_TOKEN = "<tool_call|>"
+    GEMMA4_STD_TOKEN = "<|tool>"
+    GEMMA4_ETD_TOKEN = "<tool|>"
+    GEMMA4_STR_TOKEN = "<|tool_response>"
+    GEMMA4_ETR_TOKEN = "<tool_response|>"
+
+    CHAT_FORMAT = (
+        "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n"
+        "    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n"
+        "    {%- set ns = namespace(found_first=false) -%}\n"
+        "    {%- for key, value in properties | dictsort -%}\n"
+        "        {%- set add_comma = false -%}\n"
+        "        {%- if not filter_keys or key not in standard_keys -%}\n"
+        "            {%- if ns.found_first %},{% endif -%}\n"
+        "            {%- set ns.found_first = true -%}\n"
+        "            {{ key }}:{\n"
+        "            {%- if value['description'] -%}\n"
+        "                description:<|\"|>{{ value['description'] }}<|\"|>\n"
+        "                {%- set add_comma = true -%}\n"
+        "            {%- endif -%}\n"
+        "            {%- if value['type'] | upper == 'STRING' -%}\n"
+        "                {%- if value['enum'] -%}\n"
+        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                    enum:{{ format_argument(value['enum']) }}\n"
+        "                {%- endif -%}\n"
+        "            {%- elif value['type'] | upper == 'ARRAY' -%}\n"
+        "                {%- if value['items'] is mapping and value['items'] -%}\n"
+        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                    items:{\n"
+        "                    {%- set ns_items = namespace(found_first=false) -%}\n"
+        "                    {%- for item_key, item_value in value['items'] | dictsort -%}\n"
+        "                        {%- if item_value is not none -%}\n"
+        "                            {%- if ns_items.found_first %},{% endif -%}\n"
+        "                            {%- set ns_items.found_first = true -%}\n"
+        "                            {%- if item_key == 'properties' -%}\n"
+        "                                properties:{\n"
+        "                                {%- if item_value is mapping -%}\n"
+        "                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n"
+        "                                {%- endif -%}\n"
+        "                                }\n"
+        "                            {%- elif item_key == 'required' -%}\n"
+        "                                required:[\n"
+        "                                {%- for req_item in item_value -%}\n"
+        "                                    <|\"|>{{- req_item -}}<|\"|>\n"
+        "                                    {%- if not loop.last %},{% endif -%}\n"
+        "                                {%- endfor -%}\n"
+        "                                ]\n"
+        "                            {%- elif item_key == 'type' -%}\n"
+        "                                {%- if item_value is string -%}\n"
+        "                                    type:{{ format_argument(item_value | upper) }}\n"
+        "                                {%- else -%}\n"
+        "                                    type:{{ format_argument(item_value | map('upper') | list) }}\n"
+        "                                {%- endif -%}\n"
+        "                            {%- else -%}\n"
+        "                                {{ item_key }}:{{ format_argument(item_value) }}\n"
+        "                            {%- endif -%}\n"
+        "                        {%- endif -%}\n"
+        "                    {%- endfor -%}\n"
+        "                    }\n"
+        "                {%- endif -%}\n"
+        "            {%- endif -%}\n"
+        "            {%- if value['nullable'] %}\n"
+        "                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                nullable:true\n"
+        "            {%- endif -%}\n"
+        "            {%- if value['type'] | upper == 'OBJECT' -%}\n"
+        "                {%- if value['properties'] is defined and value['properties'] is mapping -%}\n"
+        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                    properties:{\n"
+        "                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n"
+        "                    }\n"
+        "                {%- elif value is mapping -%}\n"
+        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                    properties:{\n"
+        "                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n"
+        "                    }\n"
+        "                {%- endif -%}\n"
+        "                {%- if value['required'] -%}\n"
+        "                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "                    required:[\n"
+        "                    {%- for item in value['required'] | default([]) -%}\n"
+        "                        <|\"|>{{- item -}}<|\"|>\n"
+        "                        {%- if not loop.last %},{% endif -%}\n"
+        "                    {%- endfor -%}\n"
+        "                    ]\n"
+        "                {%- endif -%}\n"
+        "            {%- endif -%}\n"
+        "            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n"
+        "            type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n"
+        "        {%- endif -%}\n"
+        "    {%- endfor -%}\n"
+        "{%- endmacro -%}\n"
+        "{%- macro format_function_declaration(tool_data) -%}\n"
+        "    declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n"
+        "    {%- set params = tool_data['function']['parameters'] -%}\n"
+        "    {%- if params -%}\n"
+        "        ,parameters:{\n"
+        "        {%- if params.get('properties') -%}\n"
+        "            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n"
+        "        {%- endif -%}\n"
+        "        {%- if params.get('required') -%}\n"
+        "            required:[\n"
+        "            {%- for item in params['required'] -%}\n"
+        "                <|\"|>{{- item -}}<|\"|>\n"
+        "                {{- ',' if not loop.last -}}\n"
+        "            {%- endfor -%}\n"
+        "            ],\n"
+        "        {%- endif -%}\n"
+        "        {%- if params.get('type') -%}\n"
+        "            type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "    {%- if 'response' in tool_data['function'] -%}\n"
+        "        {%- set response_declaration = tool_data['function']['response'] -%}\n"
+        "        ,response:{\n"
+        "        {%- if response_declaration['description'] -%}\n"
+        "            description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n"
+        "        {%- endif -%}\n"
+        "        {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n"
+        "            type:<|\"|>{{- response_declaration['type'] | upper -}}<|\"|>}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "    }\n"
+        "{%- endmacro -%}\n"
+        "{%- macro format_argument(argument, escape_keys=True) -%}\n"
+        "    {%- if argument is string -%}\n"
+        "        {{- '<|\"|>' + argument + '<|\"|>' -}}\n"
+        "    {%- elif argument is boolean -%}\n"
+        "        {{- 'true' if argument else 'false' -}}\n"
+        "    {%- elif argument is mapping -%}\n"
+        "        {{- '{' -}}\n"
+        "        {%- set ns = namespace(found_first=false) -%}\n"
+        "        {%- for key, value in argument | dictsort -%}\n"
+        "            {%- if ns.found_first %},{% endif -%}\n"
+        "            {%- set ns.found_first = true -%}\n"
+        "            {%- if escape_keys -%}\n"
+        "                {{- '<|\"|>' + key + '<|\"|>' -}}\n"
+        "            {%- else -%}\n"
+        "                {{- key -}}\n"
+        "            {%- endif -%}\n"
+        "            :{{- format_argument(value, escape_keys=escape_keys) -}}\n"
+        "        {%- endfor -%}\n"
+        "        {{- '}' -}}\n"
+        "    {%- elif argument is sequence -%}\n"
+        "        {{- '[' -}}\n"
+        "        {%- for item in argument -%}\n"
+        "            {{- format_argument(item, escape_keys=escape_keys) -}}\n"
+        "            {%- if not loop.last %},{% endif -%}\n"
+        "        {%- endfor -%}\n"
+        "        {{- ']' -}}\n"
+        "    {%- else -%}\n"
+        "        {{- argument -}}\n"
+        "    {%- endif -%}\n"
+        "{%- endmacro -%}\n"
+        "{%- macro strip_thinking(text) -%}\n"
+        "    {%- set ns = namespace(result='') -%}\n"
+        "    {%- for part in text.split('<channel|>') -%}\n"
+        "        {%- if '<|channel>' in part -%}\n"
+        "            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n"
+        "        {%- else -%}\n"
+        "            {%- set ns.result = ns.result + part -%}\n"
+        "        {%- endif -%}\n"
+        "    {%- endfor -%}\n"
+        "    {{- ns.result | trim -}}\n"
+        "{%- endmacro -%}\n"
+        "\n"
+        "{%- macro format_tool_response_block(tool_name, response) -%}\n"
+        "    {{- '<|tool_response>' -}}\n"
+        "    {%- if response is mapping -%}\n"
+        "        {{- 'response:' + tool_name + '{' -}}\n"
+        "        {%- for key, value in response | dictsort -%}\n"
+        "            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n"
+        "            {%- if not loop.last %},{% endif -%}\n"
+        "        {%- endfor -%}\n"
+        "        {{- '}' -}}\n"
+        "    {%- else -%}\n"
+        "        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n"
+        "    {%- endif -%}\n"
+        "    {{- '<tool_response|>' -}}\n"
+        "{%- endmacro -%}\n"
+        "\n"
+        "{%- set ns = namespace(prev_message_type=None) -%}\n"
+        "{%- set loop_messages = messages -%}\n"
+        "{{- bos_token -}}\n"
+        "{#- Handle System/Tool Definitions Block -#}\n"
+        "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n"
+        "    {{- '<|turn>system\\n' -}}\n"
+        "    {#- Inject Thinking token at the very top of the FIRST system turn -#}\n"
+        "    {%- if enable_thinking is defined and enable_thinking -%}\n"
+        "        {{- '<|think|>\\n' -}}\n"
+        "        {%- set ns.prev_message_type = 'think' -%}\n"
+        "    {%- endif -%}\n"
+        "    {%- if messages[0]['role'] in ['system', 'developer'] -%}\n"
+        "        {%- if messages[0]['content'] is string -%}\n"
+        "            {{- messages[0]['content'] | trim -}}\n"
+        "        {%- elif messages[0]['content'] is sequence -%}\n"
+        "            {%- for item in messages[0]['content'] -%}\n"
+        "                {{- item['text'] | trim + ' '-}}\n"
+        "            {%- endfor -%}\n"
+        "        {%- endif -%}\n"
+        "        {%- set loop_messages = messages[1:] -%}\n"
+        "    {%- endif -%}\n"
+        "    {%- if tools -%}\n"
+        "        {%- for tool in tools %}\n"
+        "            {{- '<|tool>' -}}\n"
+        "            {{- format_function_declaration(tool) | trim -}}\n"
+        "            {{- '<tool|>' -}}\n"
+        "        {%- endfor %}\n"
+        "        {%- set ns.prev_message_type = 'tool' -%}\n"
+        "    {%- endif -%}\n"
+        "    {{- '<turn|>\\n' -}}\n"
+        "{%- endif %}\n"
+        "\n"
+        "{#- Pre-scan: find last user message index for reasoning guard -#}\n"
+        "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n"
+        "{%- for i in range(loop_messages | length) -%}\n"
+        "    {%- if loop_messages[i]['role'] == 'user' -%}\n"
+        "        {%- set ns_turn.last_user_idx = i -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "\n"
+        "{#- Loop through messages -#}\n"
+        "{%- for message in loop_messages -%}\n"
+        "    {%- if message['role'] != 'tool' -%}\n"
+        "    {%- set ns.prev_message_type = None -%}\n"
+        "    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n"
+        "    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n"
+        "    {%- set prev_nt = namespace(role=None, found=false) -%}\n"
+        "    {%- if loop.index0 > 0 -%}\n"
+        "        {%- for j in range(loop.index0 - 1, -1, -1) -%}\n"
+        "            {%- if not prev_nt.found -%}\n"
+        "                {%- if loop_messages[j]['role'] != 'tool' -%}\n"
+        "                    {%- set prev_nt.role = loop_messages[j]['role'] -%}\n"
+        "                    {%- set prev_nt.found = true -%}\n"
+        "                {%- endif -%}\n"
+        "            {%- endif -%}\n"
+        "        {%- endfor -%}\n"
+        "    {%- endif -%}\n"
+        "    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n"
+        "    {%- if not continue_same_model_turn -%}\n"
+        "        {{- '<|turn>' + role + '\\n' }}\n"
+        "    {%- endif -%}\n"
+        "\n"
+        "    {#- Render reasoning/reasoning_content as thinking channel -#}\n"
+        "    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n"
+        "    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n"
+        "        {{- '<|channel>thought\\n' + thinking_text + '\\n<channel|>' -}}\n"
+        "    {%- endif -%}\n"
+        "\n"
+        "            {%- if message.get('tool_calls') -%}\n"
+        "                {%- for tool_call in message['tool_calls'] -%}\n"
+        "                    {%- set function = tool_call['function'] -%}\n"
+        "                    {{- '<|tool_call>call:' + function['name'] + '{' -}}\n"
+        "                    {%- if function['arguments'] is mapping -%}\n"
+        "                        {%- set ns_args = namespace(found_first=false) -%}\n"
+        "                        {%- for key, value in function['arguments'] | dictsort -%}\n"
+        "                            {%- if ns_args.found_first %},{% endif -%}\n"
+        "                            {%- set ns_args.found_first = true -%}\n"
+        "                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n"
+        "                        {%- endfor -%}\n"
+        "                    {%- elif function['arguments'] is string -%}\n"
+        "                        {{- function['arguments'] -}}\n"
+        "                    {%- endif -%}\n"
+        "                    {{- '}<tool_call|>' -}}\n"
+        "                {%- endfor -%}\n"
+        "                {%- set ns.prev_message_type = 'tool_call' -%}\n"
+        "            {%- endif -%}\n"
+        "\n"
+        "            {%- set ns_tr_out = namespace(flag=false) -%}\n"
+        "            {%- if message.get('tool_responses') -%}\n"
+        "                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n"
+        "                {%- for tool_response in message['tool_responses'] -%}\n"
+        "                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n"
+        "                    {%- set ns_tr_out.flag = true -%}\n"
+        "                    {%- set ns.prev_message_type = 'tool_response' -%}\n"
+        "                {%- endfor -%}\n"
+        "            {%- elif message.get('tool_calls') -%}\n"
+        "                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n"
+        "                {%- set ns_tool_scan = namespace(stopped=false) -%}\n"
+        "                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n"
+        "                    {%- if ns_tool_scan.stopped -%}\n"
+        "                    {%- elif loop_messages[k]['role'] != 'tool' -%}\n"
+        "                        {%- set ns_tool_scan.stopped = true -%}\n"
+        "                    {%- else -%}\n"
+        "                        {%- set follow = loop_messages[k] -%}\n"
+        "                        {#- Resolve tool_call_id to function name -#}\n"
+        "                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n"
+        "                        {%- for tc in message['tool_calls'] -%}\n"
+        "                            {%- if tc.get('id') == follow.get('tool_call_id') -%}\n"
+        "                                {%- set ns_tname.name = tc['function']['name'] -%}\n"
+        "                            {%- endif -%}\n"
+        "                        {%- endfor -%}\n"
+        "                        {#- Handle content as string or content-parts array -#}\n"
+        "                        {%- set tool_body = follow.get('content') -%}\n"
+        "                        {%- if tool_body is string -%}\n"
+        "                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n"
+        "                        {%- elif tool_body is sequence and tool_body is not string -%}\n"
+        "                            {%- set ns_txt = namespace(s='') -%}\n"
+        "                            {%- for part in tool_body -%}\n"
+        "                                {%- if part.get('type') == 'text' -%}\n"
+        "                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n"
+        "                                {%- endif -%}\n"
+        "                            {%- endfor -%}\n"
+        "                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n"
+        "                            {%- for part in tool_body -%}\n"
+        "                                {%- if part.get('type') == 'image_url' -%}\n"
+        "                                    {%- set url_val = part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n"
+        "                                    {{- '<|image|>' + url_val -}}\n"
+        "                                {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n"
+        "                                    {%- if part.get('type') == 'audio_url' -%}\n"
+        "                                        {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n"
+        "                                        {{- '<|audio|>' + audio_val -}}\n"
+        "                                    {%- elif part.get('type') == 'input_audio' -%}\n"
+        "                                        {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n"
+        "                                        {{- '<|audio|>' + audio_val -}}\n"
+        "                                    {%- endif -%}\n"
+        # "                              {%- elif part.get('type') == 'video_url' -%}\n"
+        # "                                  {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n"
+        # "                                  {{- '<|video|>' + video_val -}}\n"
+        "                                {%- endif -%}\n"
+        "                            {%- endfor -%}\n"
+        "                        {%- else -%}\n"
+        "                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n"
+        "                        {%- endif -%}\n"
+        "                        {%- set ns_tr_out.flag = true -%}\n"
+        "                        {%- set ns.prev_message_type = 'tool_response' -%}\n"
+        "                    {%- endif -%}\n"
+        "                {%- endfor -%}\n"
+        "            {%- endif -%}\n"
+        "\n"
+        "            {%- set captured_content -%}\n"
+        "            {%- if message['content'] is string -%}\n"
+        "                {%- if role == 'model' -%}\n"
+        "                    {{- strip_thinking(message['content']) -}}\n"
+        "                {%- else -%}\n"
+        "                    {{- message['content'] | trim -}}\n"
+        "                {%- endif -%}\n"
+        "            {%- elif message['content'] is sequence -%}\n"
+        "                {%- for item in message['content'] -%}\n"
+        "                    {%- if item['type'] == 'text' -%}\n"
+        "                        {%- if role == 'model' -%}\n"
+        "                            {{- strip_thinking(item['text']) -}}\n"
+        "                        {%- else -%}\n"
+        "                            {{- item['text'] | trim -}}\n"
+        "                        {%- endif -%}\n"
+        "                    {%- elif item['type'] == 'image_url' -%}\n"
+        "                        {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n"
+        "                        {{- '<|image|>' + url_val -}}\n"
+        "                        {%- set ns.prev_message_type = 'image' -%}\n"
+        "                    {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n"
+        "                        {%- if item['type'] == 'audio_url' -%}\n"
+        "                            {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n"
+        "                            {{- '<|audio|>' + audio_val -}}\n"
+        "                        {%- elif item['type'] == 'input_audio' -%}\n"
+        "                            {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n"
+        "                            {{- '<|audio|>' + audio_val -}}\n"
+        "                        {%- endif -%}\n"
+        "                        {%- set ns.prev_message_type = 'audio' -%}\n"
+        "                    {%- endif -%}\n"
+        # "                    {%- elif item['type'] == 'video_url' -%}\n"
+        # "                        {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n"
+        # "                        {{- '<|video|>' + video_val -}}\n"
+        # "                        {%- set ns.prev_message_type = 'video' -%}\n"
+        "                {%- endfor -%}\n"
+        "            {%- endif -%}\n"
+        "            {%- endset -%}\n"
+        "\n"
+        "            {{- captured_content -}}\n"
+        "            {%- set has_content = captured_content | trim | length > 0 -%}\n"
+        "\n"
+        "        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n"
+        "            {{- '<|tool_response>' -}}\n"
+        "        {%- elif not (ns_tr_out.flag and not has_content) -%}\n"
+        "            {{- '<turn|>\\n' -}}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "\n"
+        "{%- if add_generation_prompt -%}\n"
+        "    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n"
+        "        {{- '<|turn>model\\n' -}}\n"
+        "        {%- if not enable_thinking | default(false) -%}\n"
+        "            {{- '<|channel>thought\\n<channel|>' -}}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endif -%}\n"
+    )
+
+    def __init__(self, enable_thinking: bool = True, **kwargs):
+        """
+        Initialize the Gemma 4 handler.
+
+        Args:
+            enable_thinking (bool): Controls whether the <|think|> tag is injected and
+                                    manages <|channel>thought behavior.
+                                    Note: ONLY supported on Gemma4 31B and 26BA4B models, NOT on E2B and E4B.
+            **kwargs: Forwarded unchanged to the parent handler's initializer.
+        """
+        self.enable_thinking = enable_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Expose `enable_thinking` to the template, force the stop list, then defer to the parent."""
+        template_args = self.extra_template_arguments
+        template_args["enable_thinking"] = self.enable_thinking
+
+        # Any caller-supplied `stop` is replaced by the handler's three stop tokens.
+        # generation_config.json:   "eos_token_id": [1, 106, 50]
+        stop_tokens = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN]
+        kwargs['stop'] = stop_tokens
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
+        return super().__call__(**kwargs)
+
+
+class GLM41VChatHandler(MTMDChatHandler):
+    # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32.
+
+    GLM41V_EOS_TOKEN = "<|endoftext|>"
+    GLM41V_PAD_TOKEN = "<|endoftext|>"
+    GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>"
+    GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>"
+
+    CHAT_FORMAT = (
+        "[gMASK]<sop>\n"
+        "{%- for msg in messages -%}"
+            "{%- if msg.role == 'system' -%}"
+                "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+            "{%- elif msg.role == 'user' -%}"
+                "<|user|>\n"
+                "{%- if msg.content is string -%}"
+                    "{{ msg.content }}"
+                "{%- else -%}"
+                    "{%- for item in msg.content -%}"
+                        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
+                            "<|begin_of_image|>"
+                            "{%- if item.image_url is string -%}"
+                                "{{- item.image_url -}}"
+                            "{%- else -%}"
+                                "{{- item.image_url.url -}}"
+                            "{%- endif -%}"
+                            "<|end_of_image|>"
+                        "{%- elif item.type == 'text' -%}"
+                            "{{ item.text }}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+                "{%- endif -%}{{ GLM41V_EOS_TOKEN }}"
+            "{%- elif msg.role == 'assistant' -%}"
+                "{%- if msg.metadata -%}"
+                    "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+                "{%- else -%}"
+                    "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+                "{%- endif -%}"
+            "{%- endif -%}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+            "<|assistant|>\n"
+        "{%- endif -%}"
+    )
+
+    def __call__(self, **kwargs):
+        """Render a GLM-4.1V chat request through the parent handler.
+
+        Exposes the EOS token to the template, replaces any caller-supplied
+        `stop` list, and calls `llama.input_ids.fill(0)` when that attribute exists.
+        """
+        self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN
+        # Stop token patch; reference:
+        # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json
+        kwargs['stop'] = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", "</answer>"]
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
+
+
+class GLM46VChatHandler(MTMDChatHandler):
+    GLM46V_EOS_TOKEN = "<|endoftext|>"
+    GLM46V_PAD_TOKEN = "<|endoftext|>"
+    GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>"
+    GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>"
+
+    CHAT_FORMAT = (
+        "[gMASK]<sop>"
+        "{%- if tools -%}"
+            "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n"
+            "You are provided with function signatures within <tools></tools> XML tags:\n<tools>\n"
+            "{%- for tool in tools -%}"
+                "{{ tool | tojson(ensure_ascii=False) }}\n"
+            "{%- endfor -%}"
+            "</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n"
+            "<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n...\n</tool_call>"
+        "{%- endif -%}"
+
+        "{%- for m in messages -%}"
+            "{%- if m.role == 'system' -%}"
+                "<|system|>\n{{ m.content }}"
+            "{%- elif m.role == 'user' -%}"
+                "<|user|>\n"
+                "{%- if m.content is string -%}"
+                    "{{ m.content }}"
+                "{%- else -%}"
+                    "{%- for item in m.content -%}"
+                        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
+                            "<|begin_of_image|>"
+                            "{%- if item.image_url is string -%}"
+                                "{{- item.image_url -}}"
+                            "{%- else -%}"
+                                "{{- item.image_url.url -}}"
+                            "{%- endif -%}"
+                            "<|end_of_image|>"
+                        "{%- elif item.type == 'text' -%}"
+                            "{{ item.text }}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+                # If enable_thinking is disabled, insert `/nothink` according to the source code logic.
+                "{{ '/nothink' if not enable_thinking else '' }}"
+            "{%- elif m.role == 'assistant' -%}"
+                "<|assistant|>"
+                "{%- if enable_thinking -%}"
+                    "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}"
+                    "\n<think>{{ reasoning.strip() }}</think>"
+                "{%- else -%}"
+                    "\n<think></think>"
+                "{%- endif -%}"
+                "{{ '\n' + m.content.strip() if m.content.strip() else '' }}"
+            "{%- endif -%}"
+            "{{ GLM46V_EOS_TOKEN }}"
+        "{%- endfor -%}"
+
+        "{%- if add_generation_prompt -%}"
+            "<|assistant|>\n"
+            "{{ '<think>' if enable_thinking else '<think></think>\n' }}"
+        "{%- endif -%}"
+    )
+
+    def __init__(self, enable_thinking: bool = True, **kwargs):
+        """
+        Initialize the GLM-4.6V handler; remaining keyword arguments go to the parent initializer.
+        Parameters:
+        - enable_thinking (bool): Stored and later passed to the chat template as `enable_thinking`. Defaults to True.
+        """
+        self.enable_thinking = enable_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Pass thinking/EOS settings to the template, override `stop`, then defer to the parent."""
+        template_args = self.extra_template_arguments
+        template_args["enable_thinking"] = self.enable_thinking
+        template_args["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN
+
+        # Stop token patch; reference:
+        # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json
+        kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"]
+
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
+        return super().__call__(**kwargs)
+
+
+class GraniteDoclingChatHandler(MTMDChatHandler):
+    """
+    Handler for Granite-Docling models.
+
+    Format(512x512): <loc_xmin><loc_ymin><loc_xmax><loc_ymax>Content
+
+    Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!!
+                    Since the model does not have special tokens for the start and end of an image,
+                    it is recommended to process only one image at a time.
+                    You can iterate through the images individually for recognition.
+
+    """
+    GRANITE_BOS_TOKEN = "<|start_of_role|>"
+    GRANITE_EOS_TOKEN = "<|end_of_text|>"
+    GRANITE_PAD_TOKEN = "<|end_of_text|>"
+    GRANITE_IMAGE_TOKEN = "<image>"
+
+    CHAT_FORMAT = (
+        "{%- for message in messages -%}"
+            "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}"
+            "{%- if message['content'] is string -%}"
+                "{{- message['content'] -}}"
+            "{%- else -%}"
+                "{%- for part in message['content'] -%}"
+                    "{%- if part['type'] == 'text' -%}"
+                        "{{- part['text'] -}}"
+                    "{%- elif part['type'] == 'image_url' -%}"
+                        "{%- if part.image_url is string -%}"
+                            "{{- part.image_url -}}"
+                        "{%- else -%}"
+                            "{{- part.image_url.url -}}"
+                        "{%- endif -%}"
+                    "{%- endif -%}"
+                "{%- endfor -%}"
+            "{%- endif -%}"
+            "{{- '<|end_of_text|>\n' -}}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+            "{{- '<|start_of_role|>assistant' -}}"
+            # Support the 'controls' parameter if present in generation arguments
+            "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}"
+            "{{- '<|end_of_role|>' -}}"
+        "{%- endif -%}"
+    )
+
+    def __init__(self, controls: dict = None, **kwargs):
+        """
+        Initialize the Granite-Docling handler.
+        Args:
+            controls (dict, optional): Stored on the handler and passed to the template; when
+                truthy it is rendered as JSON after the assistant role name in the generation prompt.
+            **kwargs: Forwarded unchanged to the parent handler's initializer.
+            Common examples for 'controls' include:
+             - Document Parsing: {"mode": "document_parsing", "format": "json"}
+        """
+        self.controls = controls
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Render a Granite-Docling request through the parent handler.
+
+        Exposes `controls` to the template, sets this instance's
+        DEFAULT_SYSTEM_MESSAGE to None, and replaces `stop` with the EOS token.
+        """
+        self.extra_template_arguments["controls"] = self.controls
+        # NOTE(review): presumably stops the parent from adding a default system turn - confirm.
+        self.DEFAULT_SYSTEM_MESSAGE = None
+        kwargs['stop'] = [self.GRANITE_EOS_TOKEN]
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
+
+
+class LFM2VLChatHandler(MTMDChatHandler):
+    LFM2VL_BOS_TOKEN = "<|startoftext|>"
+    LFM2VL_EOS_TOKEN = "<|im_end|>"
+    LFM2VL_IMAGE_START_TOKEN = "<|image_start|>"
+    LFM2VL_IMAGE_END_TOKEN = "<|image_end|>"
+
+    CHAT_FORMAT = (
+        "{%- for message in messages -%}"
+            "{{ '<|im_start|>' + message['role'] + '\n' }}"
+            "{%- if message['content'] is string -%}"
+                "{{ message['content'] }}"
+            "{%- else -%}"
+                "{%- for content in message['content'] -%}"
+                    "{%- if 'image_url' in content -%}"
+                        "{%- if content.image_url is string -%}"
+                            "<|image_start|>{{ content.image_url }}<|image_end|>"
+                        "{%- else -%}"
+                            "<|image_start|>{{ content.image_url.url }}<|image_end|>"
+                        "{%- endif -%}"
+                    "{%- elif content['type'] == 'text' -%}"
+                        "{{ content['text'] }}"
+                    "{%- endif -%}"
+                "{%- endfor -%}"
+            "{%- endif -%}"
+            "{{ '<|im_end|>\n' }}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+            "{{ '<|im_start|>assistant\n' }}"
+        "{%- endif -%}"
+    )
+
+    def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs):
+        """
+        Initialize the LFM2-VL handler and hand both image token limits to the parent initializer.
+        LiquidAI officially recommends min_image_tokens=64, max_image_tokens=256 for LFM2-VL (presumably image_min_tokens / image_max_tokens here).
+        """
+        self.image_min_tokens, self.image_max_tokens = image_min_tokens, image_max_tokens
+        token_limits = {"image_min_tokens": image_min_tokens, "image_max_tokens": image_max_tokens}
+        super().__init__(**token_limits, **kwargs)
+
+    def __call__(self, **kwargs):
+        """Call `llama.input_ids.fill(0)` when present, log if verbose, then defer to the parent.
+
+        Unlike the sibling handlers in this file, no `stop` override is applied here.
+        """
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
+
+
+class LFM25VLChatHandler(MTMDChatHandler):
+    """
+    Handler for LFM2.5-VL multimodal models.
+
+    Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing.
+    """
+    # Aligned with LFM2.5-VL tokenizer_config
+    LFM25VL_BOS_TOKEN = "<|startoftext|>"
+    LFM25VL_EOS_TOKEN = "<|im_end|>"
+    LFM25VL_PAD_TOKEN = "<|pad|>"
+
+    # Image specific tokens
+    LFM25VL_IMAGE_TOKEN = "<image>"
+    LFM25VL_IMAGE_START_TOKEN = "<|image_start|>"
+    LFM25VL_IMAGE_END_TOKEN = "<|image_end|>"
+    LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>"
+
+    CHAT_FORMAT = (
+        "{{- bos_token -}}\n"
+        "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n"
+        "{%- set ns = namespace(system_prompt='', content='') -%}\n"
+        "{%- if messages[0]['role'] == 'system' -%}\n"
+        "    {%- set ns.system_prompt = messages[0]['content'] -%}\n"
+        "    {%- set messages = messages[1:] -%}\n"
+        "{%- endif -%}\n"
+        "{%- if tools -%}\n"
+        "    {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n"
+        "    {%- for tool in tools -%}\n"
+        "        {%- if tool is not string -%}\n"
+        "            {%- set tool = tool | tojson -%}\n"
+        "        {%- endif -%}\n"
+        "        {%- set ns.system_prompt = ns.system_prompt + tool -%}\n"
+        "        {%- if not loop.last -%}\n"
+        "            {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n"
+        "        {%- endif -%}\n"
+        "    {%- endfor -%}\n"
+        "    {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n"
+        "{%- endif -%}\n"
+        "{%- if ns.system_prompt -%}\n"
+        "    {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n"
+        "{%- endif -%}\n"
+        "{%- set ns.last_assistant_index = -1 -%}\n"
+        "{%- for message in messages -%}\n"
+        "    {%- if message['role'] == 'assistant' -%}\n"
+        "        {%- set ns.last_assistant_index = loop.index0 -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "{%- for message in messages -%}\n"
+        "    {{- '<|im_start|>' + message['role'] + '\\n' -}}\n"
+        "    {%- set content = message['content'] -%}\n"
+        "    {%- if content is not string -%}\n"
+        "        {%- set ns.content = '' -%}\n"
+        "        {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n"
+        "        {%- for item in content -%}\n"
+        "            {%- if item['type'] == 'image_url' -%}\n"
+        "                {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n"
+        "                {%- set ns.content = ns.content + img_val -%}\n"
+        "            {%- elif item['type'] == 'text' -%}\n"
+        "                {%- set ns.content = ns.content + item['text'] -%}\n"
+        "            {%- else -%}\n"
+        "                {%- set ns.content = ns.content + (item | tojson) -%}\n"
+        "            {%- endif -%}\n"
+        "        {%- endfor -%}\n"
+        "        {%- set content = ns.content -%}\n"
+        "    {%- endif -%}\n"
+        "    {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n"
+        "        {%- if '</think>' in content -%}\n"
+        "            {%- set content = content.split('</think>')[-1] | trim -%}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "    {{- content + '<|im_end|>\\n' -}}\n"
+        "{%- endfor -%}\n"
+        "{%- if add_generation_prompt -%}\n"
+        "    {{- '<|im_start|>assistant\\n' -}}\n"
+        "{%- endif -%}\n"
+    )
+
+    def __init__(self, keep_past_thinking: bool = False, **kwargs):
+        """Store `keep_past_thinking` for the chat template; other kwargs go to the parent initializer."""
+        self.keep_past_thinking = keep_past_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Reset an `image_min_tokens` above 256 to -1, set template args and `stop`, then defer to the parent."""
+        oversized = self.image_min_tokens > 256
+        if oversized and self.verbose:
+            print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. Please reset it to between 64 and 256.")
+        if oversized:
+            self.image_min_tokens = -1
+
+        self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking
+        kwargs['stop'] = [self.LFM25VL_EOS_TOKEN]
+        if self.verbose:
+            print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing")
+        return super().__call__(**kwargs)
+
+
+class PaddleOCRChatHandler(MTMDChatHandler):
+    """
+    Handler for PaddleOCR 1.5/1.6 multimodal models.
+    """
+
+    PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>"
+    PADDLEOCR_BOS_TOKEN = "<s>"
+    PADDLEOCR_EOS_TOKEN = "</s>"
+    PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>"
+    PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>"
+    PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>"
+
+    CHAT_FORMAT = (
+        "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}"
+        "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}"
+        "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}"
+
+        "{{- cls_token -}}"
+        "{%- for message in messages -%}"
+            "{%- if message['role'] == 'user' -%}"
+                "{{- 'User: ' -}}"
+
+                # Robust parsing: Check if content is string or list
+                "{%- if message['content'] is string -%}"
+                    "{{- message['content'] -}}"
+                "{%- else -%}"
+                    # Pass 1: Render all images first
+                    "{%- for content in message['content'] -%}"
+                        "{%- if content['type'] == 'image_url' and 'image_url' in content -%}"
+                            "{{- '<|IMAGE_START|>' -}}"
+                                "{%- if content.image_url is string -%}"
+                                    "{{- content.image_url -}}"
+                                "{%- else -%}"
+                                    "{{- content.image_url.url -}}"
+                                "{%- endif -%}"
+                            "{{- '<|IMAGE_END|>' -}}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+
+                    # Pass 2: Render all text second
+                    "{%- for content in message['content'] -%}"
+                        "{%- if content['type'] == 'text' -%}"
+                            "{{- content['text'] -}}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+                "{{- '\\n' -}}"
+
+            "{%- elif message['role'] == 'assistant' -%}"
+                "{{- 'Assistant:\\n' -}}"
+                "{%- if message['content'] is string -%}"
+                    "{{- message['content'] -}}"
+                "{%- else -%}"
+                    "{%- for content in message['content'] -%}"
+                        "{%- if content['type'] == 'text' -%}"
+                            "{{- content['text'] -}}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+                "{{- eos_token -}}"
+
+            "{%- elif message['role'] == 'system' -%}"
+                "{%- if message['content'] is string -%}"
+                    "{{- message['content'] + '\\n' -}}"
+                "{%- else -%}"
+                    "{%- for content in message['content'] -%}"
+                        "{%- if content['type'] == 'text' -%}"
+                            "{{- content['text'] + '\\n' -}}"
+                        "{%- endif -%}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+            "{%- endif -%}"
+        "{%- endfor -%}"
+
+        "{%- if add_generation_prompt -%}"
+            "{{- 'Assistant:\\n' -}}"
+        "{%- endif -%}"
+    )
+
+    def __init__(
+        self,
+        image_min_tokens: int = -1,
+        image_max_tokens: int = -1,
+        **kwargs
+    ):
+        """Record both image token limits on the handler and pass them to the parent initializer."""
+        self.image_min_tokens, self.image_max_tokens = image_min_tokens, image_max_tokens
+        super().__init__(
+            image_min_tokens=image_min_tokens,
+            image_max_tokens=image_max_tokens,
+            **kwargs
+        )
+
+    def __call__(self, **kwargs):
+        """Render a PaddleOCR request through the parent handler.
+
+        Replaces any caller-supplied `stop` with PADDLEOCR_EOS_TOKEN and
+        calls `llama.input_ids.fill(0)` when that attribute exists.
+        """
+        kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN]
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
+
+
+class Qwen25VLChatHandler(MTMDChatHandler):
+
+    QWEN25_VL_BOS_TOKEN = "<|endoftext|>"
+    QWEN25_VL_PAD_TOKEN = "<|endoftext|>"
+    QWEN25_VL_EOS_TOKEN = "<|im_end|>"
+
+    CHAT_FORMAT = (
+        "{% set image_count = namespace(value=0) %}"
+        "{% for message in messages %}"
+        "{% if loop.first and message['role'] != 'system' %}"
+        "<|im_start|>system\n"
+        "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"  # NOTE(review): in Jinja `self` is the template's own reference, not this handler, so this probably renders empty - confirm
+        "{% endif %}"
+        "<|im_start|>{{ message['role'] }}\n"
+        "{% if message['content'] is string %}"
+        "{{ message['content'] }}<|im_end|>\n"
+        "{% else %}"
+        "{% for content in message['content'] %}"
+        "{% if content['type'] == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{% set image_count.value = image_count.value + 1 %}"
+        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>"
+        "{% else %}"
+        "{% set image_count.value = image_count.value + 1 %}"
+        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>"
+        "{% endif %}"
+        "{% elif content['type'] == 'text' %}"
+        "{{ content['text'] }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        "{% endfor %}"
+        "<|im_start|>assistant\n"  # emitted unconditionally: this template does not check add_generation_prompt
+    )
+
+    def __call__(self, **kwargs):
+        """Render a Qwen2.5-VL request through the parent handler.
+
+        Replaces any caller-supplied `stop` with the EOS and PAD tokens and
+        calls `llama.input_ids.fill(0)` when that attribute exists.
+        """
+        kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN]
+        model = kwargs['llama']
+        if hasattr(model, 'input_ids'):
+            model.input_ids.fill(0)
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
+
+class Qwen3ASRChatHandler(MTMDChatHandler):
+    """
+    Handler for Qwen 3 ASR (Automatic Speech Recognition) models.
+
+    Features:
+    - Highly specialized for Speech-to-Text tasks.
+    - Aggregates all system text into a single cohesive system block.
+    - Drops user text entirely, extracting ONLY audio data into a unified user turn.
+    - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>.
+    - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url.
+    """
+
+    DEFAULT_SYSTEM_MESSAGE = """
+    You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language.
+    You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization.
+    """
+
+    QWEN3_ASR_BOS_TOKEN = "<|im_start|>"
+    QWEN3_ASR_PAD_TOKEN = "<|endoftext|>"
+    QWEN3_ASR_EOS_TOKEN = "<|im_end|>"
+
+
+    QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>"
+    QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>"
+    QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>"
+
+    CHAT_FORMAT = (
+        "{%- set ns = namespace(system_text='') -%}\n"
+        "{%- for m in messages -%}\n"
+        "    {%- if m.role == 'system' -%}\n"
+        "        {%- if m.content is string -%}\n"
+        "            {%- set ns.system_text = ns.system_text + m.content -%}\n"
+        "        {%- else -%}\n"
+        "            {%- for c in m.content -%}\n"
+        "                {%- if c.type == 'text' and (c.text is defined) -%}\n"
+        "                    {%- set ns.system_text = ns.system_text + c.text -%}\n"
+        "                {%- endif -%}\n"
+        "            {%- endfor -%}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "\n"
+        "{%- set ns2 = namespace(audio_tokens='') -%}\n"
+        "{%- for m in messages -%}\n"
+        "    {%- if m.content is not string -%}\n"
+        "        {%- for c in m.content -%}\n"
+        "            {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n"
+        "                {#- MTMD Audio Injection -#}\n"
+        "                {%- set audio_val = '' -%}\n"
+        "                {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n"
+        "                    {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n"
+        "                {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n"
+        "                    {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n"
+        "                {%- endif -%}\n"
+        "                {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n"
+        "            {%- endif -%}\n"
+        "        {%- endfor -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "\n"
+        "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n"
+        "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n"
+        "{%- if add_generation_prompt -%}\n"
+        "    {{- '<|im_start|>assistant\\n' -}}\n"
+        "{%- endif -%}\n"
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        """Force the Qwen3 text stop tokens, zero `llama.input_ids`, then delegate to the parent."""
+        # Stop on `<|endoftext|>` / `<|im_end|>`; the `<|audio_*|>` tokens only wrap input media.
+        kwargs['stop'] = [self.QWEN3_ASR_PAD_TOKEN, self.QWEN3_ASR_EOS_TOKEN]
+
+        llama = kwargs['llama']
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)")
+
+        return super().__call__(**kwargs)
+
+class Qwen3VLChatHandler(MTMDChatHandler):
+
+    QWEN3_VL_BOS_TOKEN = "<|endoftext|>"
+    QWEN3_VL_PAD_TOKEN = "<|endoftext|>"
+    QWEN3_VL_EOS_TOKEN = "<|im_end|>"
+
+    CHAT_FORMAT = (
+        "{{- '<|im_start|>system\n' -}}"
+        "{%- if messages[0].content is string and messages[0].role == 'system' -%}"
+            "{{- messages[0].content -}}"
+        "{%- elif messages[0].role == 'system' -%}"
+            "{%- if 'text' in messages[0].content -%}"
+                "{{- messages[0].content.text -}}"
+            "{%- else -%}"
+                "{{- 'You are a helpful assistant.' -}}"
+            "{%- endif -%}"
+        "{%- endif -%}"
+        "{%- if tools -%}"
+            "{{- '\n\n' -}}"
+            "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' -}}"
+            "{%- for tool in tools -%}"
+                "{{- '\n' -}}"
+                "{{- tool | tojson -}}"
+            "{%- endfor -%}"
+            "{{- '\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>' -}}"
+        "{%- endif -%}"
+        "{{- '<|im_end|>\n' -}}"
+        "{%- set image_count = namespace(value=0) -%}"
+        #"{%- set video_count = namespace(value=0) -%}"
+        "{%- for message in messages -%}"
+            "{%- if message.role == 'tool' -%}"
+                "{{- '<|im_start|>user\n<tool_response>\n' -}}"
+            "{%- elif message.role != 'system' -%}"
+                "{{- '<|im_start|>' + message.role + '\n' -}}"
+            "{%- endif -%}"
+            "{%- if message.content is string and message.role != 'system' -%}"
+                "{{- message.content -}}"
+            "{%- elif message.role != 'system' -%}"
+                "{%- for content in message.content -%}"
+                    "{%- if 'image_url' in content -%}"
+                        "{%- set image_count.value = image_count.value + 1 -%}"
+                        "{%- if add_vision_id -%}"
+                            "{{- 'Picture ' -}}"
+                            "{{- image_count.value | string -}}"
+                            "{{- ': ' -}}"
+                        "{%- endif -%}"
+                        "{{- '<|vision_start|>' -}}"
+                        "{%- if content.image_url is string -%}"
+                            "{{- content.image_url -}}"
+                        "{%- else -%}"
+                            "{{- content.image_url.url -}}"
+                        "{%- endif -%}"
+                        "{{- '<|vision_end|>' -}}"
+                    "{%- endif -%}"
+                    # Video not supported yet
+                    "{%- if 'text' in content -%}"
+                        "{{- content.text -}}"
+                    "{%- endif -%}"
+                "{%- endfor -%}"
+            "{%- endif -%}"
+            "{%- if message.role == 'assistant' -%}"
+                "{%- if message.tool_calls -%}"
+                    "{%- for tool_call in message.tool_calls -%}"
+                        "{%- if (loop.first and message.content) or (not loop.first) -%}"
+                            "{{- '\n' -}}"
+                        "{%- endif -%}"
+                        "{%- if tool_call.function -%}"
+                            "{%- set tool_call = tool_call.function -%}"
+                        "{%- endif -%}"
+                        "{{- '<tool_call>\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}"
+                        "{%- if tool_call.arguments is string -%}"
+                            "{{- tool_call.arguments -}}"
+                        "{%- else -%}"
+                            "{{- tool_call.arguments | tojson -}}"
+                        "{%- endif -%}"
+                        "{{- '}\n</tool_call>' -}}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+            "{%- elif message.role == 'tool' -%}"
+                "{{- '</tool_response>' -}}"
+            "{%- endif -%}"
+            "{%- if message.role != 'system' -%}"
+                "{{- '<|im_end|>\n' -}}"
+            "{%- endif -%}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+            "{{- '<|im_start|>assistant\n' -}}"
+            "{%- if force_reasoning -%}"
+                "{{- '<think>\n' -}}"
+            "{%- endif -%}"
+        "{%- endif -%}"
+    )
+
+    def __init__(
+        self,
+        force_reasoning: bool = False,
+        add_vision_id: bool = True,
+        **kwargs,
+    ):
+        """
+        Parameters:
+        - force_reasoning (bool):
+            - True: Force the reasoning in the model by adding <think> to the chat template.
+            - False (default): Don't force the reasoning.
+        - add_vision_id (bool):
+            - True (default): Count all the images. Recommended for multi-image.
+            - False: Doesn't count the images. Can save tokens with single-image.
+        """
+        super().__init__(**kwargs)
+        self.force_reasoning = force_reasoning
+        self.extra_template_arguments["force_reasoning"] = force_reasoning
+        self.extra_template_arguments["add_vision_id"] = add_vision_id
+
+    def __call__(self, **kwargs):
+        kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN]
+
+        llama = kwargs['llama']
+
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing")
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
+
+class Qwen35ChatHandler(MTMDChatHandler):
+    """
+    Handler for Qwen3.5/Qwen3.6 models.
+    """
+    CHAT_FORMAT = (
+        "{%- set image_count = namespace(value=0) -%}"
+        "{%- set video_count = namespace(value=0) -%}"
+        "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}"
+        "    {%- if content is string -%}"
+        "        {{- content -}}"
+        "    {%- elif content is iterable and content is not mapping -%}"
+        "        {%- for item in content -%}"
+        "            {%- if 'image_url' in item or item.type == 'image_url' -%}"
+        "                {%- if is_system_content -%}"
+        "                    {{- raise_exception('System message cannot contain images.') -}}"
+        "                {%- endif -%}"
+        "                {%- if do_vision_count -%}"
+        "                    {%- set image_count.value = image_count.value + 1 -%}"
+        "                {%- endif -%}"
+        "                {%- if add_vision_id -%}"
+        "                    {{- 'Picture ' -}}"
+        "                    {{- image_count.value | string -}}"
+        "                    {{- ': ' -}}"
+        "                {%- endif -%}"
+        "                {{- '<|vision_start|>' -}}"
+        "                {%- if item.image_url is string -%}"
+        "                    {{- item.image_url -}}"
+        "                {%- else -%}"
+        "                    {{- item.image_url.url -}}"
+        "                {%- endif -%}"
+        "                {{- '<|vision_end|>' -}}"
+        "            {%- elif 'video' in item -%}"
+        "                {{- raise_exception('llama.cpp does not currently support video.') -}}"  # Video not supported, raise exception
+        "                {%- if is_system_content -%}"
+        "                    {{- raise_exception('System message cannot contain videos.') -}}"
+        "                {%- endif -%}"
+        "                {%- if do_vision_count -%}"
+        "                    {%- set video_count.value = video_count.value + 1 -%}"
+        "                {%- endif -%}"
+        "                {%- if add_vision_id -%}"
+        "                    {{- 'Video ' ~ video_count.value ~ ': ' -}}"
+        "                {%- endif -%}"
+        "                {{- '<|vision_start|>' -}}"
+        "                {{- item.video -}}"
+        "                {{- '<|vision_end|>' -}}"
+        "            {%- elif 'text' in item -%}"
+        "                {{- item.text -}}"
+        "            {%- else -%}"
+        "                {{- raise_exception('Unexpected item type in content.') -}}"
+        "            {%- endif -%}"
+        "        {%- endfor -%}"
+        "    {%- elif content is none or content is undefined -%}"
+        "        {{- '' -}}"
+        "    {%- else -%}"
+        "        {{- raise_exception('Unexpected content type.') -}}"
+        "    {%- endif -%}"
+        "{%- endmacro -%}"
+        "{%- if not messages -%}"
+        "    {{- raise_exception('No messages provided.') -}}"
+        "{%- endif -%}"
+        "{%- if tools and tools is iterable and tools is not mapping -%}"
+        "    {{- '<|im_start|>system\n' -}}"
+        "    {{- '# Tools\n\nYou have access to the following functions:\n\n<tools>' -}}"
+        "    {%- for tool in tools -%}"
+        "        {{- '\n' -}}"
+        "        {{- tool | tojson -}}"
+        "    {%- endfor -%}"
+        "    {{- '\n</tools>' -}}"
+        "    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' -}}"
+        "    {%- if messages[0].role == 'system' -%}"
+        "        {%- set content = render_content(messages[0].content, false, true) | trim -%}"
+        "        {%- if content -%}"
+        "            {{- '\n\n' + content -}}"
+        "        {%- endif -%}"
+        "    {%- endif -%}"
+        "    {{- '<|im_end|>\n' -}}"
+        "{%- elif messages[0].role == 'system' -%}"
+        "    {%- set content = render_content(messages[0].content, false, true) -%}"
+        "    {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}"
+        "{%- endif -%}"
+        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}"
+        "{%- for message in messages[::-1] -%}"
+        "    {%- set index = messages | length - 1 - loop.index0 -%}"
+        "    {%- if ns.multi_step_tool and message.role == 'user' -%}"
+        "        {%- set content = render_content(message.content, false) | trim -%}"
+        "        {%- if not (content.startswith('<tool_response>') and content.endswith('</tool_response>')) -%}"
+        "            {%- set ns.multi_step_tool = false -%}"
+        "            {%- set ns.last_query_index = index -%}"
+        "        {%- endif -%}"
+        "    {%- endif -%}"
+        "{%- endfor -%}"
+        "{%- if ns.multi_step_tool -%}"
+        "    {{- raise_exception('No user query found in messages.') -}}"
+        "{%- endif -%}"
+        "{%- for message in messages -%}"
+        "    {%- set content = render_content(message.content, true) | trim -%}"
+        "    {%- if message.role == 'system' -%}"
+        "        {%- if not loop.first -%}"
+        "            {{- raise_exception('System message must be at the beginning.') -}}"
+        "        {%- endif -%}"
+        "    {%- elif message.role == 'user' -%}"
+        "        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}"
+        "    {%- elif message.role == 'assistant' -%}"
+        "        {%- set reasoning_content = '' -%}"
+        "        {%- if message.reasoning_content is string -%}"
+        "            {%- set reasoning_content = message.reasoning_content -%}"
+        "        {%- elif '</think>' in content -%}"
+        "            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') -%}"
+        "            {%- set content = content.split('</think>')[-1].lstrip('\n') -%}"
+        "        {%- endif -%}"
+        "        {%- set reasoning_content = reasoning_content | trim -%}"
+        "        {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}"
+        "            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content -}}"
+        "        {%- else -%}"
+        "            {{- '<|im_start|>' + message.role + '\n' + content -}}"
+        "        {%- endif -%}"
+        "        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}"
+        "            {%- for tool_call in message.tool_calls -%}"
+        "                {%- if tool_call.function is defined -%}"
+        "                    {%- set tool_call = tool_call.function -%}"
+        "                {%- endif -%}"
+        "                {%- if loop.first -%}"
+        "                    {%- if content | trim -%}"
+        "                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
+        "                    {%- else -%}"
+        "                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
+        "                    {%- endif -%}"
+        "                {%- else -%}"
+        "                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' -}}"
+        "                {%- endif -%}"
+        "                {%- if tool_call.arguments is defined -%}"
+        "                    {%- for (args_name, args_value) in tool_call.arguments | items -%}"
+        "                        {{- '<parameter=' + args_name + '>\n' -}}"
+        "                        {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}"
+        "                        {{- args_value -}}"
+        "                        {{- '\n</parameter>' -}}"
+        "                    {%- endfor -%}"
+        "                {%- endif -%}"
+        "                {{- '</function>\n</tool_call>' -}}"
+        "            {%- endfor -%}"
+        "        {%- endif -%}"
+        "        {{- '<|im_end|>\n' -}}"
+        "    {%- elif message.role == 'tool' -%}"
+        "        {%- if loop.previtem and loop.previtem.role != 'tool' -%}"
+        "            {{- '<|im_start|>user' -}}"
+        "        {%- endif -%}"
+        "        {{- '\n<tool_response>\n' -}}"
+        "        {{- content -}}"
+        "        {{- '\n</tool_response>' -}}"
+        "        {%- if not loop.last and loop.nextitem.role != 'tool' -%}"
+        "            {{- '<|im_end|>\n' -}}"
+        "        {%- elif loop.last -%}"
+        "            {{- '<|im_end|>\n' -}}"
+        "        {%- endif -%}"
+        "    {%- else -%}"
+        "        {{- raise_exception('Unexpected message role.') -}}"
+        "    {%- endif -%}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+        "    {{- '<|im_start|>assistant\n' -}}"
+        "    {%- if enable_thinking is defined and enable_thinking is false -%}"
+        "        {{- '<think>\n\n</think>\n\n' -}}"
+        "    {%- else -%}"
+        "        {{- '<think>\n' -}}"
+        "    {%- endif -%}"
+        "{%- endif -%}"
+    )
+
+    def __init__(
+        self,
+        add_vision_id: bool = True,
+        enable_thinking: bool = True,
+        preserve_thinking: bool = False,
+        **kwargs,
+    ):
+        """
+        Parameters:
+        - add_vision_id (bool):
+            - True (default): Count all the images. Recommended for multi-image.
+            - False: Doesn't count the images. Can save tokens with single-image.
+        - enable_thinking (bool):
+            - True (default): Enables reasoning for better results.
+            - False: Disables reasoning for faster results.
+        - preserve_thinking (bool):
+            - True: Keeps <think> reasoning process for ALL historical conversational turns.
+            - False (default): Only keeps <think> for the latest assistant reply to save tokens.
+        """
+        super().__init__(**kwargs)
+        self.enable_thinking = enable_thinking
+        self.preserve_thinking = preserve_thinking
+        self.extra_template_arguments["add_vision_id"] = add_vision_id
+        self.extra_template_arguments["enable_thinking"] = enable_thinking
+        self.extra_template_arguments["preserve_thinking"] = preserve_thinking
+
+    def __call__(self, **kwargs):
+        llama = kwargs['llama']
+
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing")
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
+
+
+class Step3VLChatHandler(MTMDChatHandler):
+    """
+    Handler for Step3-VL models.
+    """
+
+    STEP3VL_BOS_TOKEN = "<|im_start|>"
+    STEP3VL_EOS_TOKEN = "<|im_end|>"
+    STEP3VL_PAD_TOKEN = "<|endoftext|>"
+    STEP3VL_IMAGE_TOKEN = "<im_patch>"
+
+    CHAT_FORMAT = (
+        "{%- macro render_content(content) -%}\n"
+        "    {%- if content is none -%}{{- '' -}}\n"
+        "    {%- elif content is string -%}{{- content -}}\n"
+        "    {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n"
+        "    {%- elif content is iterable -%}\n"
+        "        {%- for item in content -%}\n"
+        "            {%- if item.type == 'text' -%}\n"
+        "                {{- item['value'] if 'value' in item else item['text'] -}}\n"
+        "            {%- elif item.type in ['image', 'image_url'] -%}\n"
+        "                {%- set url_val = '' -%}\n"
+        "                {%- if item.image_url -%}\n"
+        "                    {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n"
+        "                {%- endif -%}\n"
+        "                {{- '<im_patch>' + url_val -}}\n"
+        "            {%- endif -%}\n"
+        "        {%- endfor -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endmacro -%}\n"
+        "\n"
+        "{%- if tools -%}\n"
+        "    {{- '<|im_start|>system\\n' -}}\n"
+        "    {%- if messages[0].role == 'system' -%}\n"
+        "        {{- render_content(messages[0].content) + '\\n\\n' -}}\n"
+        "    {%- endif -%}\n"
+        "    {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>' -}}\n"
+        "    {%- for tool in tools -%}\n"
+        "        {{- '\\n' -}}\n"
+        "        {{- tool | tojson -}}\n"
+        "    {%- endfor -%}\n"
+        "    {{- '\\n</tools>\\n\\nAlways adhere to this exact format for tool use:\\n<tool_calls>\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>\\n{additional_tool_calls}</tool_calls>\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags.\\n- `<function-name>` must be an exact match to one of the available tools.\\n- `<args-json-object>` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n"
+        "{%- else -%}\n"
+        "    {%- if messages[0].role == 'system' -%}\n"
+        "        {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n"
+        "    {%- endif -%}\n"
+        "{%- endif -%}\n"
+        "\n"
+        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n"
+        "{%- for message in messages[::-1] -%}\n"
+        "    {%- set index = (messages|length - 1) - loop.index0 -%}\n"
+        "    {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) -%}\n"
+        "        {%- set ns.multi_step_tool = false -%}\n"
+        "        {%- set ns.last_query_index = index -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "\n"
+        "{%- for message in messages -%}\n"
+        "    {%- set content = render_content(message.content) -%}\n"
+        "    {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n"
+        "        {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n"
+        "        {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n"
+        "    {%- elif message.role == 'assistant' -%}\n"
+        "        {%- if message.reasoning_content is string -%}\n"
+        "            {%- set reasoning_content = render_content(message.reasoning_content) -%}\n"
+        "        {%- else -%}\n"
+        "            {%- if '</think>' in content -%}\n"
+        "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') -%}\n"
+        "                {%- set content = content.split('</think>')[-1].lstrip('\\n') -%}\n"
+        "            {%- else -%}\n"
+        "                {%- set reasoning_content = '' -%}\n"
+        "            {%- endif -%}\n"
+        "        {%- endif -%}\n"
+        "        {%- if loop.index0 > ns.last_query_index -%}\n"
+        "            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n' + content -}}\n"
+        "        {%- else -%}\n"
+        "            {{- '<|im_start|>' + message.role + '\\n' + content -}}\n"
+        "        {%- endif -%}\n"
+        "        {%- if message.tool_calls -%}\n"
+        "            {{- '\\n<tool_calls>' -}}\n"
+        "            {%- for tool_call in message.tool_calls -%}\n"
+        "                {{- '\\n' -}}\n"
+        "                {%- if tool_call.function -%}\n"
+        "                    {%- set tool_call = tool_call.function -%}\n"
+        "                {%- endif -%}\n"
+        "                {{- '<tool_call>\\n{\"name\": \"' -}}\n"
+        "                {{- tool_call.name -}}\n"
+        "                {{- '\", \"arguments\": ' -}}\n"
+        "                {%- if tool_call.arguments is string -%}\n"
+        "                    {{- tool_call.arguments -}}\n"
+        "                {%- else -%}\n"
+        "                    {{- tool_call.arguments | tojson -}}\n"
+        "                {%- endif -%}\n"
+        "                {{- '}\\n</tool_call>' -}}\n"
+        "            {%- endfor -%}\n"
+        "            {{- '\\n</tool_calls>' -}}\n"
+        "        {%- endif -%}\n"
+        "        {{- '<|im_end|>\\n' -}}\n"
+        "    {%- elif message.role == 'tool' -%}\n"
+        "        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n"
+        "            {{- '<|im_start|>tool_response' -}}\n"
+        "        {%- endif -%}\n"
+        "        {{- '\\n<tool_response>\\n' -}}\n"
+        "        {{- content -}}\n"
+        "        {{- '\\n</tool_response>' -}}\n"
+        "        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n"
+        "            {{- '<|im_end|>\\n' -}}\n"
+        "        {%- endif -%}\n"
+        "    {%- endif -%}\n"
+        "{%- endfor -%}\n"
+        "{%- if add_generation_prompt -%}\n"
+        "    {{- '<|im_start|>assistant\\n<think>\\n\\n</think>\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n<think>' -}}\n"
+        "{%- endif -%}\n"
+    )
+
+    def __init__(self, enable_thinking: bool = True, **kwargs):
+        """
+        Initializes the Step3-VL Handler.
+
+        Args:
+            enable_thinking (bool): If False, injects an empty <think> block to bypass reasoning.
+        """
+        self.enable_thinking = enable_thinking
+        super().__init__(**kwargs)
+
+    def __call__(self, **kwargs):
+        # Pass thinking toggle into Jinja
+        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
+
+        # Step3 uses standard <|im_end|> ChatML stop formatting
+        kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN]
+
+        if self.verbose:
+            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")
+
+        return super().__call__(**kwargs)

From d84b0c21fa4a131df5c17ddd1b2447929dc1973f Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 22:08:32 +0800
Subject: [PATCH 23/36] fix(model): handle missing chat templates

- Update LlamaModel.model_chat_template() to return Optional[str] and accept
name=None for the default model chat template.

- llama_model_chat_template() may return nullptr when no chat template is
available. Handle that case explicitly instead of decoding a null pointer, and
return None so callers can apply their own fallback logic.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/_internals.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 434921e6bd..91befb2247 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -152,12 +152,17 @@ def model_size(self) -> int:
         """
         return llama_cpp.llama_model_size(self.model)
 
-    def model_chat_template(self, name: bytes) -> str:
+    def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]:
         """
-        Get the default chat template. Returns nullptr if not available
-        If name is NULL, returns the default chat template
+        Get a chat template from the model.
+
+        If name is None, returns the default chat template.
+        Returns None if no chat template is available.
         """
-        return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8")
+        template = llama_cpp.llama_model_chat_template(self.model, name)
+        if template is None:
+            return None
+        return template.decode("utf-8")
 
     def n_params(self) -> int:
         """

From c9745316d748cec408b07c2f3a43fd97fa921e73 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 14 Jun 2026 23:43:33 +0800
Subject: [PATCH 24/36] feat(mtmd): enhance generic chat template support

- Enhance GenericMTMDChatHandler to better support model-provided chat templates.

- Allow the generic handler to accept an optional named chat template, load it
from the model at call time via llama_model_chat_template(), fall back to the
model's default chat template, and finally use the built-in MTMD CHAT_FORMAT
when no model template is available.

- Also expand the generic media placeholder list for common multimodal templates
and document the handler as a template-driven MTMD implementation. This prepares
the generic path for a later render-driven placeholder replacement pass.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama.py            |   2 +
 llama_cpp/llama_multimodal.py | 118 +++++++++++++++++++++++++++++++---
 2 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index dbc60eaf76..b6a2c8d5a7 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -174,6 +174,7 @@ def __init__(
         log_filters: Optional[Sequence[str]] = None,
         log_filters_case_sensitive: bool = True,
         # Extra Params
+        chat_template_name: Optional[str] = None,
         chat_handler_kwargs: Dict[str, Any] = {},
         **kwargs,  # type: ignore
     ):
@@ -721,6 +722,7 @@ def __init__(
                 chat_format = self.metadata.get("tokenizer.chat_template", None),
                 mmproj_path = mmproj_path,
                 verbose = self.verbose,
+                chat_template_name=chat_template_name,
                 **chat_handler_kwargs
             )
 
diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py
index a055869543..a0f7e594e4 100644
--- a/llama_cpp/llama_multimodal.py
+++ b/llama_cpp/llama_multimodal.py
@@ -91,6 +91,8 @@ class MTMDChatHandler:
         "{% endif %}"
     )
 
+    KNOWN_MEDIA_TAGS: List[str] = []
+
     def __init__(
         self,
         mmproj_path: Optional[str] = None,
@@ -1189,41 +1191,137 @@ def from_pretrained(
             **kwargs,
         )
 
-# Experiments are not recommended for this purpose at this time.
+# Generic template-driven MTMD handler.
 class GenericMTMDChatHandler(MTMDChatHandler):
+    """
+    Generic MTMD chat handler backed by the model-provided chat template.
+
+    This handler is intentionally template-driven. It renders the model's
+    tokenizer.chat_template first, then normalizes rendered media URLs or
+    placeholder tokens into MTMD media markers before tokenization.
+
+    It is designed for model templates that emit media placeholders such as
+    <|image_pad|>, <|image|>, <image>, [IMG], or Kimi-style <|media_pad|>.
+    Model-specific handlers may still be preferable when a model requires
+    special stop tokens, generation flags, or non-standard template arguments.
+    """
+
     KNOWN_MEDIA_TAGS = [
+        # Pad placeholders inside model-specific wrappers.
         "<|image_pad|>",
         "<|audio_pad|>",
         "<|video_pad|>",
+
+        # Direct placeholders inside Gemma/Llama/GLM-style wrappers.
         "<|image|>",
         "<|audio|>",
         "<|video|>",
-        "[IMG]"
+
+        # LLaVA / LFM / Mistral-style placeholders.
+        "<image>",
+        "<audio>",
+        "<video>",
+        "[IMG]",
+
+        # Kimi-style placeholders.
+        "<|media_pad|>",
+        "<|kimi_k25_video_placeholder|>",
     ]
 
     def __init__(
         self,
-        chat_format: str,
+        chat_format: Optional[str],
         mmproj_path: str,
         verbose: bool = True,
+        chat_template_name: Optional[str] = None,
         **kwargs
     ) -> None:
-
         self.chat_format = chat_format
-        if self.chat_format is None:
-            raise ValueError("Failed to get model chat template automatically.")
-
+        self.chat_template_name = chat_template_name
         self.verbose = verbose
-        if self.verbose:
-            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
+
+        if self.verbose and self.chat_format is not None:
+            print(
+                f"{self.__class__.__name__}.__init__: using provided chat template:\n"
+                f"```jinja\n{self.chat_format}\n```",
+                file=sys.stderr,
+            )
 
         super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs)
 
+    def _resolve_chat_format(self, llama: llama_core.Llama) -> str:
+        # Highest priority: use the template explicitly provided by the caller.
+        if self.chat_format is not None:
+            return self.chat_format
+
+        chat_format = None
+
+        # The Llama instance is only available at call time, so query llama.cpp here
+        # for either the requested named template or the model's default template.
+        try:
+            name = (
+                self.chat_template_name.encode("utf-8")
+                if self.chat_template_name is not None
+                else None
+            )
+            chat_format = llama._model.model_chat_template(name)
+        except Exception as exc:
+            if self.verbose:
+                print(
+                    f"{self.log_prefix}: failed to load chat template"
+                    f"{f' {self.chat_template_name!r}' if self.chat_template_name else ''} "
+                    f"from llama model: {exc}",
+                    file=sys.stderr,
+                )
+
+        # If a named template is unavailable, try the default model template.
+        if chat_format is None and self.chat_template_name is not None:
+            try:
+                chat_format = llama._model.model_chat_template(None)
+                if self.verbose and chat_format is not None:
+                    print(
+                        f"{self.log_prefix}: chat template {self.chat_template_name!r} "
+                        "not found; using default model chat template.",
+                        file=sys.stderr,
+                    )
+            except Exception as exc:
+                if self.verbose:
+                    print(
+                        f"{self.log_prefix}: failed to load default model chat template: {exc}",
+                        file=sys.stderr,
+                    )
+
+        # Last resort: use the generic built-in MTMD template.
+        if chat_format is None:
+            chat_format = self.CHAT_FORMAT
+            if self.verbose:
+                print(
+                    f"{self.log_prefix}: no model chat template found; "
+                    "using MTMDChatHandler built-in CHAT_FORMAT.",
+                    file=sys.stderr,
+                )
+
+        self.chat_format = chat_format
+        return chat_format
+
     def __call__(self, **kwargs):
+        llama = kwargs["llama"]
+
+        self._resolve_chat_format(llama)
+
+        if self.chat_format is None:
+            raise ValueError(
+                f"{self.log_prefix}: failed to resolve a chat template. "
+                "`chat_format` must be a Jinja chat template string. You may pass it "
+                "directly, read it from a chat_template.jinja file, set a valid "
+                "`chat_template_name` for a named template stored in the model, or use "
+                "a model that provides tokenizer.chat_template metadata."
+            )
+
         self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]
 
         if self.verbose:
-            print(f"{self.log_prefix} - Start processing")
+            print(f"{self.log_prefix} - Start processing", file=sys.stderr)
 
         # Use parent implementation
         return super().__call__(**kwargs)

From d5a9f646ea63e95dacdcc41d6db5cfb6c647d400 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 15 Jun 2026 02:08:19 +0800
Subject: [PATCH 25/36] Update Submodule vendor/llama.cpp e8067a8..8edaca9

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e8067a8b36..8edaca9034 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e8067a8b3624aa40cc88ecb2940060e5d65b7532
+Subproject commit 8edaca9034bd6dbe729895315b922bdd90d9766b

From 284df51035c5caac143956982a739bbae13cded7 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 15 Jun 2026 02:19:31 +0800
Subject: [PATCH 26/36] feat(mtmd): broaden multimodal media extraction

- Broaden MTMD media extraction to support common multimodal content shapes used
by model chat templates.

- In addition to OpenAI-style image_url/audio_url/video_url chunks, accept
image/audio/video typed chunks and direct media keys such as {"image": "..."},
{"audio": "..."}, or {"video": "..."}. This keeps the extracted media list
aligned with templates that emit placeholders for image, audio, or video content
without requiring URL-specific chunk names.

- Add a shared helper for extracting URLs, local paths, existing data URIs, or
inline base64 payloads from media content items. Preserve capability checks,
strict input_audio format validation, and explicit errors for missing or
ambiguous media payloads.

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_multimodal.py | 251 +++++++++++++++++++++++++---------
 1 file changed, 183 insertions(+), 68 deletions(-)

diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py
index a0f7e594e4..f1b320b772 100644
--- a/llama_cpp/llama_multimodal.py
+++ b/llama_cpp/llama_multimodal.py
@@ -256,80 +256,195 @@ def close(self) -> None:
     def __del__(self) -> None:
         self.close()
 
-    def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]:
+    def _get_media_url(
+        self,
+        content: Dict[str, Any],
+        keys: Tuple[str, ...],
+        media_type: str,
+    ) -> str:
         """
-        Extracts all media payloads (images, audio) sequentially to maintain exact chronological order.
-        Strictly enforces capability checks, raising exceptions if unsupported media is passed.
+        Extract a media URL or data URI from a multimodal content item.
 
-        Returns:
-            media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio).
+        Different chat templates and client APIs may represent the same media
+        payload with slightly different keys. For example, an image may appear as
+        `image`, `image_url`, or a typed chunk with `{"type": "image", ...}`.
+        This helper checks the provided keys in order and returns the first usable
+        media payload.
+
+        Returns an empty string when none of the requested keys exist or when the
+        payload shape is unsupported. The caller is responsible for raising a
+        media-type-specific error when an empty value is not acceptable.
+        """
+        # Try keys in priority order. This lets callers prefer canonical fields
+        # such as "image" over compatibility aliases such as "image_url", while
+        # still accepting either representation.
+        value = None
+        for key in keys:
+            if key in content:
+                value = content[key]
+                break
+
+        # String payloads may already be URLs, local paths, or data URIs.
+        if isinstance(value, str):
+            return value
+
+        if isinstance(value, dict):
+            # Common OpenAI-style shape:
+            # {"image_url": {"url": "..."}}
+            if "url" in value:
+                return value["url"]
+
+            # Forward-compatible inline media shape:
+            # {"audio": {"data": "...", "format": "wav"}}
+            #
+            # Convert it to a data URI so downstream media loading does not need
+            # separate branches for raw base64 payloads.
+            if "data" in value and "format" in value:
+                media_format = value.get("format", "")
+                media_data = value.get("data", "")
+                if media_format and media_data:
+                    return f"data:{media_type}/{media_format};base64,{media_data}"
+
+        return ""
+
+    def _get_media_items(
+        self,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+    ) -> List[Dict[str, str]]:
+        """
+        Extract media payloads from chat messages in message/content order.
+
+        Supports OpenAI-style typed media chunks as well as template-friendly
+        variants used by multimodal chat templates, such as:
+        - {"type": "image_url", "image_url": {"url": "..."}}
+        - {"type": "image", "image": "..."}
+        - {"image": "..."}
+        - {"type": "audio_url", "audio_url": {"url": "..."}}
+        - {"type": "audio", "audio": "..."}
+        - {"type": "input_audio", "input_audio": {"data": "...", "format": "wav"}}
+        - {"type": "video_url", "video_url": {"url": "..."}}
+        - {"type": "video", "video": "..."}
+        - {"video": "..."}
+
+        The returned order must match the media placeholders emitted by the rendered
+        chat template as closely as possible.
         """
         media_items: List[Dict[str, str]] = []
+
         for message in messages:
-            if isinstance(message.get("content"), list):
-                for content in message["content"]:
-                    content_type = content.get("type", "")
-
-                    # 1. Vision Processing
-                    if content_type == "image_url":
-                        if not self.is_support_vision:
-                            raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.")
-
-                        url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"]
-                        media_items.append({"url": url, "type": "image"})
-
-                    # 2. Audio Processing
-                    elif content_type in ["audio", "audio_url", "input_audio"]:
-                        if not self.is_support_audio:
-                            raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.")
-
-                        # Case A: Handle custom/forward-compatible audio_url format
-                        if content_type == "audio_url" or content_type == "audio":
-                            audio_url = content[content_type]
-                            url = audio_url if isinstance(audio_url, str) else audio_url["url"]
-                            media_items.append({"url": url, "type": "audio"})
-                        # Case B: Handle OpenAI standard input_audio format
-                        elif content_type == "input_audio":
-                            input_audio = content.get("input_audio", {})
-                            if isinstance(input_audio, dict) and "data" in input_audio:
-                                # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic
-                                # input_audio: {
-                                #     data: audio.base64Data,
-                                #     format: audio.mimeType.includes('wav') ? 'wav' : 'mp3'
-                                # }
-                                audio_data = input_audio.get("data", "")
-                                audio_format = input_audio.get("format", "")
-
-                                # Strictly align with llama.cpp (require wav/mp3)
-                                if audio_format not in ["wav", "mp3"]:
-                                    raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'")
-
-                                # Format as a Data URI to reuse the unified load_media logic
-                                media_items.append({
-                                    "url": f"data:audio/{audio_format};base64,{audio_data}",
-                                    "type": "audio"
-                                })
-                            else:
-                                # Just a raw base64 data
-                                url = input_audio if isinstance(input_audio, str) else ""
-                                if url:
-                                    media_items.append({"url": url, "type": "audio"})
-
-                    # 3. Video Processing
-                    elif content_type == "video_url":
-                        if not self.is_support_video:
-                            raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.")
-
-                        video_url = content["video_url"]
-                        url = video_url if isinstance(video_url, str) else video_url["url"]
-                        media_items.append({"url": url, "type": "video"})
-
-                    # 4. Text & Unknown Types
-                    elif content_type == "text":
-                        continue
+            content_list = message.get("content")
+            if not isinstance(content_list, list):
+                continue
+
+            for content in content_list:
+                if not isinstance(content, dict):
+                    continue
+
+                content_type = content.get("type", "")
+
+                has_image = (
+                    content_type in ("image", "image_url")
+                    or "image" in content
+                    or "image_url" in content
+                )
+                has_audio = (
+                    content_type in ("audio", "audio_url", "input_audio")
+                    or "audio" in content
+                    or "audio_url" in content
+                    or "input_audio" in content
+                )
+                has_video = (
+                    content_type in ("video", "video_url")
+                    or "video" in content
+                    or "video_url" in content
+                )
+
+                media_kind_count = int(has_image) + int(has_audio) + int(has_video)
+                if media_kind_count > 1:
+                    raise ValueError(
+                        f"{self.log_prefix}: content item contains multiple media types; "
+                        "each content item must contain only one of image, audio, or video."
+                    )
+
+                # 1. Vision Processing
+                if has_image:
+                    if not self.is_support_vision:
+                        raise ValueError(
+                            f"{self.log_prefix}: This mmproj model instance does not support image inputs."
+                        )
+
+                    url = self._get_media_url(
+                        content,
+                        keys=("image", "image_url"),
+                        media_type="image",
+                    )
+                    if not url:
+                        raise ValueError(f"{self.log_prefix}: missing image url/data.")
+
+                    media_items.append({"url": url, "type": "image"})
+
+                # 2. Audio Processing
+                elif has_audio:
+                    if not self.is_support_audio:
+                        raise ValueError(
+                            f"{self.log_prefix}: This mmproj model instance does not support audio inputs."
+                        )
+
+                    if content_type == "input_audio" or "input_audio" in content:
+                        input_audio = content.get("input_audio", {})
+
+                        if isinstance(input_audio, dict) and "data" in input_audio:
+                            audio_data = input_audio.get("data", "")
+                            audio_format = input_audio.get("format", "")
+
+                            # Strictly align with llama.cpp.
+                            if audio_format not in ["wav", "mp3"]:
+                                raise ValueError(
+                                    f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'"
+                                )
+
+                            url = f"data:audio/{audio_format};base64,{audio_data}"
+                        else:
+                            url = input_audio if isinstance(input_audio, str) else ""
                     else:
-                        if self.verbose:
-                            print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr)
+                        url = self._get_media_url(
+                            content,
+                            keys=("audio", "audio_url"),
+                            media_type="audio",
+                        )
+
+                    if not url:
+                        raise ValueError(f"{self.log_prefix}: missing audio url/data.")
+
+                    media_items.append({"url": url, "type": "audio"})
+
+                # 3. Video Processing
+                elif has_video:
+                    if not self.is_support_video:
+                        raise ValueError(
+                            f"{self.log_prefix}: This libmtmd build does not support video inputs."
+                        )
+
+                    url = self._get_media_url(
+                        content,
+                        keys=("video", "video_url"),
+                        media_type="video",
+                    )
+                    if not url:
+                        raise ValueError(f"{self.log_prefix}: missing video url/data.")
+
+                    media_items.append({"url": url, "type": "video"})
+
+                # 4. Text & Unknown Types
+                elif content_type == "text" or "text" in content:
+                    continue
+                else:
+                    if self.verbose:
+                        print(
+                            f"{self.log_prefix}: ignored unknown content type '{content_type}'.",
+                            file=sys.stderr,
+                        )
+
         return media_items
 
     def _create_bitmap_from_bytes(self, media_bytes: bytes):

From ca605ea4c4dc24c7a2758cfe832bb8e698c463a5 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 16 Jun 2026 08:29:03 +0800
Subject: [PATCH 27/36] Update Submodule vendor/llama.cpp 8edaca9..7dad2f1

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 8edaca9034..7dad2f1a17 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8edaca9034bd6dbe729895315b922bdd90d9766b
+Subproject commit 7dad2f1a17d65b5e2034c277125bc9f97573a779

From 6a48d5879469aa461d1885c02e6b884a4fb359cb Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Tue, 16 Jun 2026 08:31:10 +0800
Subject: [PATCH 28/36] Update mtmd API 20260616

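- Add the mtmd_helper_post_decode_callback type and the new callback/user_data
arguments of mtmd_helper_decode_image_chunk().

- Also rename the llama_cpp import alias to llama_cpp_lib for consistency with the
rest of the MTMD bindings and to avoid confusion with the mtmd wrapper module.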

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/mtmd_cpp.py | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 4513761a63..4e8053331e 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -27,7 +27,7 @@
     TYPE_CHECKING,
 )
 
-import llama_cpp.llama_cpp as llama_cpp
+import llama_cpp.llama_cpp as llama_cpp_lib
 
 from llama_cpp._ctypes_extensions import (
     load_shared_library,
@@ -277,14 +277,14 @@ def mtmd_context_params_default() -> mtmd_context_params:
 @ctypes_function_mtmd(
     "mtmd_init_from_file", [
         c_char_p,
-        llama_cpp.llama_model_p_ctypes,
+        llama_cpp_lib.llama_model_p_ctypes,
         mtmd_context_params,
     ],
     mtmd_context_p_ctypes,
 )
 def mtmd_init_from_file(
     mmproj_fname: c_char_p,
-    text_model: llama_cpp.llama_model_p,
+    text_model: llama_cpp_lib.llama_model_p,
     ctx_params: mtmd_context_params,
     /,
 ) -> mtmd_context_p:
@@ -562,11 +562,11 @@ def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p) -> c_int32:
 @ctypes_function_mtmd(
     "mtmd_input_chunk_get_tokens_text",
     [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)],
-    POINTER(llama_cpp.llama_token)
+    POINTER(llama_cpp_lib.llama_token)
 )
 def mtmd_input_chunk_get_tokens_text(
     chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", /
-) -> Optional["_Pointer[llama_cpp.llama_token]"]:
+) -> Optional["_Pointer[llama_cpp_lib.llama_token]"]:
     ...
 
 # MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
@@ -1126,7 +1126,7 @@ def mtmd_helper_image_get_decoder_pos(
 @ctypes_function_mtmd(
     "mtmd_helper_eval_chunks", [
         mtmd_context_p_ctypes,
-        llama_cpp.llama_context_p_ctypes,
+        llama_cpp_lib.llama_context_p_ctypes,
         mtmd_input_chunks_p_ctypes,
         c_int32,
         c_int32,
@@ -1137,7 +1137,7 @@ def mtmd_helper_image_get_decoder_pos(
     c_int32)
 def mtmd_helper_eval_chunks(
     ctx: mtmd_context_p,
-    lctx: llama_cpp.llama_context_p,
+    lctx: llama_cpp_lib.llama_context_p,
     chunks: mtmd_input_chunks_p,
     n_past: c_int32,
     seq_id: c_int32,
@@ -1169,7 +1169,7 @@ def mtmd_helper_eval_chunks(
 @ctypes_function_mtmd(
     "mtmd_helper_eval_chunk_single", [
         mtmd_context_p_ctypes,
-        llama_cpp.llama_context_p_ctypes,
+        llama_cpp_lib.llama_context_p_ctypes,
         mtmd_input_chunk_p_ctypes,
         c_int32,
         c_int32,
@@ -1180,7 +1180,7 @@ def mtmd_helper_eval_chunks(
     c_int32)
 def mtmd_helper_eval_chunk_single(
     ctx: mtmd_context_p,
-    lctx: llama_cpp.llama_context_p,
+    lctx: llama_cpp_lib.llama_context_p,
     chunks: mtmd_input_chunk_p,
     n_past: c_int32,
     seq_id: c_int32,
@@ -1195,6 +1195,13 @@ def mtmd_helper_eval_chunk_single(
     ...
 
 
+# typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data);
+mtmd_helper_post_decode_callback = CFUNCTYPE(
+    c_int32,
+    llama_cpp_lib.llama_batch,
+    c_void_p,
+)
+
 # // helper function to decode an image whose embeddings have already been calculated
 # // this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
 # // ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
@@ -1205,28 +1212,34 @@ def mtmd_helper_eval_chunk_single(
 #                                                 llama_pos n_past,
 #                                                 llama_seq_id seq_id,
 #                                                 int32_t n_batch,
-#                                                 llama_pos * new_n_past);
+#                                                 llama_pos * new_n_past,
+#                                                 mtmd_helper_post_decode_callback callback,
+#                                                 void * user_data);
 @ctypes_function_mtmd(
     "mtmd_helper_decode_image_chunk", [
         mtmd_context_p_ctypes,
-        llama_cpp.llama_context_p_ctypes,
+        llama_cpp_lib.llama_context_p_ctypes,
         mtmd_input_chunk_p_ctypes,
         POINTER(c_float),
         c_int32,
         c_int32,
         c_int32,
         POINTER(c_int32),
+        mtmd_helper_post_decode_callback,
+        c_void_p,
     ],
     c_int32)
 def mtmd_helper_decode_image_chunk(
     ctx: mtmd_context_p,
-    lctx: llama_cpp.llama_context_p,
-    chunks: mtmd_input_chunk_p,
+    lctx: llama_cpp_lib.llama_context_p,
+    chunk: mtmd_input_chunk_p,
     encoded_embd: POINTER(c_float), # type: ignore
     n_past: c_int32,
     seq_id: c_int32,
     n_batch: c_int32,
     new_n_past: POINTER(c_int32),   # type: ignore
+    callback: mtmd_helper_post_decode_callback, # type: ignore
+    user_data: c_void_p,
     /,
 ) -> c_int32:
     """

From d22099d4e46cc64a945c41c88f67fb40f9e8f5be Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Thu, 18 Jun 2026 06:08:40 +0800
Subject: [PATCH 29/36] Update Submodule vendor/llama.cpp 7dad2f1..f3e1828

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 7dad2f1a17..f3e1828164 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 7dad2f1a17d65b5e2034c277125bc9f97573a779
+Subproject commit f3e182816421c648188b5eab269853bf1531d950

From 491ef59d8c6686369bdbff28294bacba2acebf07 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Thu, 18 Jun 2026 06:49:41 +0800
Subject: [PATCH 30/36] ci(macos): update CMake arguments for Metal builds

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 .github/workflows/test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a9f359d1cd..ec81b294c4 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -40,11 +40,11 @@ jobs:
           # macOS Metal
           - os: macos-26
             python-version: "3.9"
-            cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on"
+            cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on"
             metal_status: "(Metal)"
           - os: macos-26
             python-version: "3.14"
-            cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on"
+            cmake_args: "-DGGML_METAL_EMBED_LIBRARY=off -DGGML_RPC=on"
             metal_status: "(Metal)"
 
     steps:

From b4f2d966f6e4dfd3163a0a7a1e7a9c51c3b26145 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Fri, 19 Jun 2026 10:52:46 +0800
Subject: [PATCH 31/36] Update Submodule vendor/llama.cpp f3e1828..db52540

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/mtmd_cpp.py | 12 ++++++------
 vendor/llama.cpp      |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 4e8053331e..04884f5a62 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -489,13 +489,13 @@ def mtmd_bitmap_set_id(
 #                                              void * user_data,
 #                                              mtmd_bitmap_lazy_callback callback);
 @ctypes_function_mtmd(
-    "mtmd_input_chunks_get", [
+    "mtmd_bitmap_init_lazy", [
         mtmd_context_p_ctypes,
         c_char_p,
         c_void_p,
         mtmd_bitmap_lazy_callback,
     ], mtmd_bitmap_p_ctypes)
-def mtmd_input_chunks_get(
+def mtmd_bitmap_init_lazy(
     ctx: mtmd_context_p,
     id: c_char_p,
     user_data: c_void_p,
@@ -529,11 +529,11 @@ def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p) -> c_size_t:
 @ctypes_function_mtmd(
     "mtmd_input_chunks_get", [
         mtmd_input_chunks_p_ctypes,
-        c_int32,
+        c_size_t,
     ], mtmd_input_chunk_p_ctypes)
 def mtmd_input_chunks_get(
     chunks: mtmd_input_chunks_p,
-    idx: c_int32,
+    idx: c_size_t,
     /,
 ) -> mtmd_input_chunk_p:
     ...
@@ -726,7 +726,7 @@ def mtmd_image_tokens_get_decoder_pos(image_tokens: mtmd_image_tokens_p, pos_0:
         mtmd_input_chunks_p_ctypes,
         mtmd_input_text_p_ctypes,
         POINTER(mtmd_bitmap_p_ctypes),
-        c_uint,
+        c_size_t,
     ],
     c_int32,
 )
@@ -735,7 +735,7 @@ def mtmd_tokenize(
     output: mtmd_input_chunks_p,
     text: mtmd_input_text_p,
     bitmaps: POINTER(mtmd_bitmap_p), # type: ignore
-    n_bitmaps: c_uint,
+    n_bitmaps: c_size_t,
     /,
 ) -> c_int32:
     """
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f3e1828164..db52540f73 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f3e182816421c648188b5eab269853bf1531d950
+Subproject commit db52540f730de39efcf7172d4ab1f79bb50556e2

From ffc528f17065fa6c92b46cf7420464caa2d5aaba Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sat, 20 Jun 2026 12:00:29 +0800
Subject: [PATCH 32/36] Update Submodule vendor/llama.cpp db52540..e27f308

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index db52540f73..e27f308597 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit db52540f730de39efcf7172d4ab1f79bb50556e2
+Subproject commit e27f3085973722407518ea4822fb3e0a2b41df9c

From df66523e7654b29c7da0b3ab640574b7b00b6f09 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 21 Jun 2026 19:25:49 +0800
Subject: [PATCH 33/36] Update Submodule vendor/llama.cpp e27f308..bfa3219

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e27f308597..bfa3219177 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e27f3085973722407518ea4822fb3e0a2b41df9c
+Subproject commit bfa3219177c81bbf9f38939901656d60a745eb7e

From f566d6be1c867615acc7daf97ebab9d4e2eef696 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 21 Jun 2026 19:26:19 +0800
Subject: [PATCH 34/36] Sync upstream: common/json-schema-to-grammar : align
 spacing rules with parsers (#24835)

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_grammar.py | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py
index 3c431fc3d8..67ad424490 100644
--- a/llama_cpp/llama_grammar.py
+++ b/llama_cpp/llama_grammar.py
@@ -465,18 +465,18 @@ def __init__(self, content: str, deps: list = None):
 SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
 
 PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'boolean'      : BuiltinRule('("true" | "false")', []),
     'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
     'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part)', ['integral-part']),
     'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []),
     'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
-    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
-    'null'         : BuiltinRule('"null" space', []),
+    'string'       : BuiltinRule(r'"\"" char* "\""', ['char']),
+    'null'         : BuiltinRule('"null"', []),
 }
 
 # TODO: support "uri", "email" string formats
@@ -484,9 +484,9 @@ def __init__(self, content: str, deps: list = None):
     'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
     'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
     'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\""', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\""', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']),
 }
 
 DOTALL = '[\\U00000000-\\U0010FFFF]'
@@ -585,7 +585,7 @@ def visit(node):
                 out.append(f'[^"{"".join(rejects)}] {char_rule}*')
         visit(trie)
 
-        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["]')
         return ''.join(out)
 
     def _add_rule(self, name, rule):
@@ -815,7 +815,7 @@ def join_seq():
         return self._add_rule(
             name,
             to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"")
 
 
     def _resolve_ref(self, ref):
@@ -846,10 +846,10 @@ def visit(self, schema, name):
             return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
 
         elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
 
         elif 'enum' in schema:
-            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')'
             return self._add_rule(rule_name, rule)
 
         elif schema_type in (None, 'object') and \
@@ -890,7 +890,7 @@ def add_component(comp_schema, is_required):
                     enum_intersection &= s
 
                 if enum_intersection:
-                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
+                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')'
                     return self._add_rule(rule_name, rule)
 
             return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
@@ -904,12 +904,12 @@ def add_component(comp_schema, is_required):
                     ' "," space '.join(
                         self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
                         for i, item in enumerate(items)) +
-                    ' "]" space')
+                    ' space "]"')
             else:
                 item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
                 min_items = schema.get("minItems", 0)
                 max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"')
 
         elif schema_type in (None, 'string') and 'pattern' in schema:
             return self._visit_pattern(schema['pattern'], rule_name)
@@ -929,7 +929,7 @@ def add_component(comp_schema, is_required):
             min_len = schema.get('minLength', 0)
             max_len = schema.get('maxLength')
 
-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""')
 
         elif schema_type in (None, 'integer') and \
                 ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
@@ -946,7 +946,7 @@ def add_component(comp_schema, is_required):
 
             out = ["("]
             _generate_min_max_int(min_value, max_value, out)
-            out.append(") space")
+            out.append(")")
             return self._add_rule(rule_name, ''.join(out))
 
         elif (schema_type == 'object') or (len(schema) == 0):
@@ -1031,7 +1031,7 @@ def get_recursive_refs(ks, first_is_optional):
                 rule += ' )'
             rule += ' )?'
 
-        rule += ' "}" space'
+        rule += ' space "}"'
 
         return rule
 

From a27b10061f294ef2a67cfb442b3ae0a8db21c231 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Sun, 21 Jun 2026 21:11:53 +0800
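Subject: [PATCH 35/36] Update Submodule vendor/llama.cpp bfa3219..bddfd2b

Also update the llama_cpp/mtmd_cpp.py bindings:
- Add the mtmd_progress_callback function type
- Add progress_callback / progress_callback_user_data to mtmd_context_params
- Annotate nullable callback parameters as Optional

Signed-off-by: JamePeng <jame_peng@sina.com>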
---
 llama_cpp/mtmd_cpp.py | 25 ++++++++++++++++++++-----
 vendor/llama.cpp      |  2 +-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 04884f5a62..27a1a56d8d 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -169,6 +169,13 @@ class mtmd_pos_type(enum.IntEnum):
 mtmd_batch_p = NewType("mtmd_batch_p", int)
 mtmd_batch_p_ctypes = c_void_p
 
+# typedef bool (*mtmd_progress_callback)(float progress, void * user_data);
+mtmd_progress_callback = CFUNCTYPE(
+    c_bool,
+    c_float,                  # progress
+    c_void_p,                 # user_data
+)
+
 # struct mtmd_input_text {
 #     const char * text;
 #     bool add_special;
@@ -217,19 +224,25 @@ class clip_context_params(Structure):
 #     const char * media_marker;
 #     enum llama_flash_attn_type flash_attn_type;
 #     bool warmup; // whether to run a warmup encode pass after initialization
-#
+
 #     // limit number of image tokens, only for vision models with dynamic resolution
 #     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
 #     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
-#
+
 #     // callback function passed over to mtmd proper
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;
-#
+
 #     // batching params
 #     int32_t batch_max_tokens; // maximum number of output tokens in a batch
 #                               // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
 #                               // (default: 1024)
+
+#     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+#     // If the provided progress_callback returns true, model loading continues.
+#     // If it returns false, model loading is immediately aborted.
+#     mtmd_progress_callback progress_callback;
+#     void * progress_callback_user_data;
 # };
 class mtmd_context_params(Structure):
     _fields_ = [
@@ -245,6 +258,8 @@ class mtmd_context_params(Structure):
         ("cb_eval", ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
         ("batch_max_tokens", c_int32),
+        ("progress_callback", mtmd_progress_callback),
+        ("progress_callback_user_data", c_void_p)
     ]
 
 mtmd_context_params_p_ctypes = POINTER(mtmd_context_params)
@@ -499,7 +514,7 @@ def mtmd_bitmap_init_lazy(
     ctx: mtmd_context_p,
     id: c_char_p,
     user_data: c_void_p,
-    callback: mtmd_bitmap_lazy_callback,  # type: ignore
+    callback: Optional[mtmd_bitmap_lazy_callback],  # type: ignore
     /,
 ) -> mtmd_bitmap_p:
     ...
@@ -1238,7 +1253,7 @@ def mtmd_helper_decode_image_chunk(
     seq_id: c_int32,
     n_batch: c_int32,
     new_n_past: POINTER(c_int32),   # type: ignore
-    callback: mtmd_helper_post_decode_callback, # type: ignore
+    callback: Optional[mtmd_helper_post_decode_callback], # type: ignore
     user_data: c_void_p,
     /,
 ) -> c_int32:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index bfa3219177..bddfd2b113 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit bfa3219177c81bbf9f38939901656d60a745eb7e
+Subproject commit bddfd2b1137cd6e51fbb939081caf50e9f496a66

From acc3fe2f214a4cbeb0bd859f37c626b8ce9ef926 Mon Sep 17 00:00:00 2001
From: JamePeng <jame_peng@sina.com>
Date: Mon, 22 Jun 2026 00:22:47 +0800
Subject: [PATCH 36/36] feat(speculative): Improve ngram-map draft selection
 and accept feedback

- Store accepted draft lengths per key/value and truncate future drafts accordingly
- Make key-only mode draft on any key match without applying min_hits
- Select k4v continuations by frequency instead of latest occurrence
- Skip ambiguous k4v drafts when the top continuation is not dominant
- Track fixed-size k4v continuations to keep frequency statistics comparable

Signed-off-by: JamePeng <jame_peng@sina.com>
---
 llama_cpp/llama_speculative.py | 93 ++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 15 deletions(-)

diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py
index c4289d0797..5cb930558e 100644
--- a/llama_cpp/llama_speculative.py
+++ b/llama_cpp/llama_speculative.py
@@ -106,11 +106,23 @@ def __init__(
         #   key -> {position: continuation}
         #
         # A dict is used so that recent entries can be refreshed when more continuation
-        # tokens become available.
+        # tokens become available. Draft selection is based on continuation frequency,
+        # not just the most recent continuation.
         self._map_k4v: DefaultDict[
             Tuple[int, ...], Dict[int, Tuple[int, ...]]
         ] = collections.defaultdict(dict)
 
+        # Acceptance feedback, aligned with llama.cpp's ngram-map behavior:
+        # accept(n) stores how many tokens were accepted for the key/value used by
+        # the previous draft and limits the future draft length for that key/value.
+        self._accepted_k: Dict[Tuple[int, ...], int] = {}
+        self._accepted_k4v: DefaultDict[
+            Tuple[int, ...], Dict[Tuple[int, ...], int]
+        ] = collections.defaultdict(dict)
+
+        self._last_draft_key: Optional[Tuple[int, ...]] = None
+        self._last_draft_value: Optional[Tuple[int, ...]] = None
+
         self._closed = False
         self._last_draft_len = 0
 
@@ -124,6 +136,10 @@ def clear(self) -> None:
         self._history.clear()
         self._map_k.clear()
         self._map_k4v.clear()
+        self._accepted_k.clear()
+        self._accepted_k4v.clear()
+        self._last_draft_key = None
+        self._last_draft_value = None
         self._last_draft_len = 0
 
     def close(self) -> None:
@@ -147,13 +163,27 @@ def accept(self, n_accepted: int) -> None:
         """
         Notify how many draft tokens were accepted by the target model.
 
-        This implementation does not need to update internal state here, because the
-        next call receives the verified token history through `input_ids`.
-
-        The method is kept for API symmetry and future extensions, such as acceptance
-        statistics, adaptive reset, or low-acceptance fallback.
+        The accepted length, clamped to the previous draft's length, is stored for the
+        key/value behind that draft and caps future drafts for it, matching llama.cpp's
+        ngram-map feedback loop. A stored 0 suppresses those drafts until clear().
         """
-        return
+        if n_accepted < 0:
+            raise ValueError("n_accepted must be non-negative")
+
+        if self._last_draft_key is None or self._last_draft_len <= 0:
+            return
+
+        accepted = min(int(n_accepted), self._last_draft_len)
+
+        if self.mode == "k":
+            self._accepted_k[self._last_draft_key] = accepted
+        else:
+            if self._last_draft_value is not None:
+                self._accepted_k4v[self._last_draft_key][self._last_draft_value] = accepted
+
+        self._last_draft_key = None
+        self._last_draft_value = None
+        self._last_draft_len = 0
 
     def _sync_and_index(self, input_ids: npt.NDArray[np.intc]) -> None:
         """
@@ -231,9 +261,11 @@ def _sync_and_index(self, input_ids: npt.NDArray[np.intc]) -> None:
             for pos in range(start, end):
                 key_start = pos
                 value_start = pos + self.ngram_size
-                value_end = min(value_start + self.num_pred_tokens, len(self._history))
+                value_end = value_start + self.num_pred_tokens
 
-                if value_start >= value_end:
+                # K4V tracks fixed-size continuation m-grams. Partial tail values are
+                # intentionally skipped so frequency statistics remain comparable.
+                if value_end > len(self._history):
                     continue
 
                 key = tuple(self._history[key_start:value_start])
@@ -267,6 +299,8 @@ def __call__(
         _ = kwargs
 
         self._sync_and_index(input_ids)
+        self._last_draft_key = None
+        self._last_draft_value = None
         self._last_draft_len = 0
 
         if len(self._history) < self.ngram_size:
@@ -276,28 +310,57 @@ def __call__(
 
         if self.mode == "k":
             positions = self._map_k.get(search_key)
-            if not positions or len(positions) < self.min_hits:
+            if not positions:
                 return np.array([], dtype=np.intc)
 
-            # Use the latest valid match with an available continuation.
+            # Key-only mode follows llama.cpp's ngram-map-k behavior: once a key
+            # match is found, draft from the latest valid match. min_hits is not
+            # used as a confidence gate for key-only mode.
             draft: List[int] = []
+            accepted_limit = self._accepted_k.get(search_key, self.num_pred_tokens)
+            if accepted_limit <= 0:
+                return np.array([], dtype=np.intc)
+
             for pos in reversed(positions):
                 start = pos + self.ngram_size
                 if start < len(self._history):
-                    end = min(start + self.num_pred_tokens, len(self._history))
+                    end = min(start + accepted_limit, len(self._history))
                     draft = self._history[start:end]
                     break
 
+            self._last_draft_key = search_key
+
         else:
             values = self._map_k4v.get(search_key)
             if not values or len(values) < self.min_hits:
                 return np.array([], dtype=np.intc)
 
-            # Use the continuation from the latest historical position.
-            latest_pos = max(values)
-            draft = list(values[latest_pos])
+            # K4V mode chooses the most frequent continuation m-gram rather than the
+            # latest one. If the strongest continuation is not at least twice as
+            # frequent as all other continuations combined, skip drafting.
+            counts = collections.Counter(values.values())
+            best_value, best_count = counts.most_common(1)[0]
+            other_count = sum(counts.values()) - best_count
+
+            if other_count > 0 and best_count < 2 * other_count:
+                return np.array([], dtype=np.intc)
+
+            accepted_limit = self._accepted_k4v[search_key].get(
+                best_value, self.num_pred_tokens
+            )
+            if accepted_limit <= 0:
+                return np.array([], dtype=np.intc)
+
+            draft = list(best_value[:accepted_limit])
+            self._last_draft_key = search_key
+            self._last_draft_value = best_value
 
         self._last_draft_len = len(draft)
+        if self._last_draft_len <= 0:
+            self._last_draft_key = None
+            self._last_draft_value = None
+            return np.array([], dtype=np.intc)
+
         return np.asarray(draft, dtype=np.intc)